daren/apps/rlhf/management/commands/analyze_data.py
2025-06-09 18:00:00 +08:00

368 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from django.core.management.base import BaseCommand
from rlhf.models import Conversation, Message, Feedback, DetailedFeedback, FeedbackTag
from django.db.models import Count, Avg, Sum, Q, F
from django.utils import timezone
from django.contrib.auth import get_user_model
import json
from datetime import datetime, timedelta
User = get_user_model()
class Command(BaseCommand):
help = '分析RLHF反馈数据生成统计报告'
def add_arguments(self, parser):
parser.add_argument(
'--export',
action='store_true',
help='导出数据到JSON文件',
)
parser.add_argument(
'--days',
type=int,
default=30,
help='分析最近的天数',
)
def handle(self, *args, **options):
self.stdout.write(self.style.SUCCESS("=" * 60))
self.stdout.write(self.style.SUCCESS("🤖 在线人类反馈强化学习系统 - 数据分析报告"))
self.stdout.write(self.style.SUCCESS("=" * 60))
# 基本统计
feedback_stats = self.get_feedback_stats()
self.stdout.write(self.style.SUCCESS(f"\n📊 反馈统计:"))
self.stdout.write(f" 总反馈数量: {feedback_stats['total_feedback']}")
self.stdout.write(f" 正面反馈: {feedback_stats['positive_feedback']} ({feedback_stats['positive_rate']:.1f}%)")
self.stdout.write(f" 负面反馈: {feedback_stats['negative_feedback']}")
self.stdout.write(f" 平均反馈分数: {feedback_stats['avg_feedback']:.2f}")
# 对话统计
conv_stats = self.get_conversation_stats()
self.stdout.write(self.style.SUCCESS(f"\n💬 对话统计:"))
self.stdout.write(f" 总对话数量: {conv_stats['total_conversations']}")
self.stdout.write(f" 总消息数量: {conv_stats['total_messages']}")
self.stdout.write(f" 平均每对话消息数: {conv_stats['avg_messages_per_conversation']:.1f}")
# 标签统计
tag_stats = self.get_tag_stats()
self.stdout.write(self.style.SUCCESS(f"\n🏷️ 标签统计:"))
self.stdout.write(f" 最常用的正面标签:")
for tag in tag_stats['top_positive']:
self.stdout.write(f" - {tag['tag_name']}: {tag['count']}")
self.stdout.write(f" 最常用的负面标签:")
for tag in tag_stats['top_negative']:
self.stdout.write(f" - {tag['tag_name']}: {tag['count']}")
# 每日趋势
days = options['days']
daily_trend = self.get_daily_feedback_trend(days)
self.stdout.write(self.style.SUCCESS(f"\n📈 最近{days}天反馈趋势:"))
for day in daily_trend:
self.stdout.write(f" {day['date']}: {day['total']}条反馈 (正面率: {day['positive_rate']:.1f}%)")
# 用户统计
user_stats = self.get_user_stats()
self.stdout.write(self.style.SUCCESS(f"\n👥 用户统计:"))
self.stdout.write(f" 总用户数量: {user_stats['total_users']}")
self.stdout.write(f" 活跃标注用户: {user_stats['active_users']}")
self.stdout.write(f" 平均每用户标注量: {user_stats['avg_annotations_per_user']:.1f}")
# 导出数据
if options['export']:
filename = self.export_data_to_json()
self.stdout.write(self.style.SUCCESS(f"\n✅ 数据已导出到: {filename}"))
def get_feedback_stats(self):
"""获取反馈统计信息"""
# 基本反馈统计
basic_feedback = Feedback.objects.aggregate(
total=Count('id'),
positive=Sum(Case(When(feedback_value__gt=0, then=1), default=0)),
negative=Sum(Case(When(feedback_value__lt=0, then=1), default=0)),
avg=Avg('feedback_value')
)
# 详细反馈统计
detailed_feedback = DetailedFeedback.objects.aggregate(
total=Count('id'),
positive=Count('id', filter=Q(feedback_type='positive')),
negative=Count('id', filter=Q(feedback_type='negative'))
)
# 合并统计
total = (basic_feedback['total'] or 0) + (detailed_feedback['total'] or 0)
positive = (basic_feedback['positive'] or 0) + (detailed_feedback['positive'] or 0)
negative = (basic_feedback['negative'] or 0) + (detailed_feedback['negative'] or 0)
# 计算平均分和正面比例
avg_feedback = basic_feedback['avg'] or 0
positive_rate = (positive / total * 100) if total > 0 else 0
return {
'total_feedback': total,
'positive_feedback': positive,
'negative_feedback': negative,
'avg_feedback': avg_feedback,
'positive_rate': positive_rate
}
def get_conversation_stats(self):
"""获取对话统计信息"""
total_conversations = Conversation.objects.count()
total_messages = Message.objects.count()
# 计算每个对话的消息数量分布
conversation_messages = Message.objects.values('conversation').annotate(count=Count('id'))
avg_messages = conversation_messages.aggregate(Avg('count'))['count__avg'] or 0
return {
'total_conversations': total_conversations,
'total_messages': total_messages,
'avg_messages_per_conversation': avg_messages
}
def get_tag_stats(self):
"""获取标签使用统计"""
# 分析DetailedFeedback中的标签使用情况
# 注意由于标签可能存储为JSON字符串这里需要解析
# 首先获取所有的标签
all_tags = FeedbackTag.objects.all()
tag_id_to_name = {str(tag.id): tag.tag_name for tag in all_tags}
# 计算每个标签的使用次数
tag_counts = {}
for feedback in DetailedFeedback.objects.all():
if feedback.feedback_tags:
try:
# 尝试解析JSON标签列表
tag_ids = json.loads(feedback.feedback_tags)
if isinstance(tag_ids, list):
for tag_id in tag_ids:
tag_id = str(tag_id)
if tag_id in tag_counts:
tag_counts[tag_id] += 1
else:
tag_counts[tag_id] = 1
except (json.JSONDecodeError, TypeError):
# 如果不是有效的JSON可能是单个标签ID
tag_id = str(feedback.feedback_tags)
if tag_id in tag_counts:
tag_counts[tag_id] += 1
else:
tag_counts[tag_id] = 1
# 获取排名前5的正面和负面标签
positive_tags = FeedbackTag.objects.filter(tag_type='positive')
negative_tags = FeedbackTag.objects.filter(tag_type='negative')
top_positive = []
for tag in positive_tags:
tag_id = str(tag.id)
if tag_id in tag_counts:
top_positive.append({
'tag_name': tag.tag_name,
'count': tag_counts[tag_id]
})
top_negative = []
for tag in negative_tags:
tag_id = str(tag.id)
if tag_id in tag_counts:
top_negative.append({
'tag_name': tag.tag_name,
'count': tag_counts[tag_id]
})
# 按使用次数排序
top_positive.sort(key=lambda x: x['count'], reverse=True)
top_negative.sort(key=lambda x: x['count'], reverse=True)
# 取前5
return {
'top_positive': top_positive[:5],
'top_negative': top_negative[:5]
}
def get_daily_feedback_trend(self, days=30):
"""获取每日反馈趋势"""
# 计算开始日期
start_date = timezone.now().date() - timedelta(days=days)
# 基本反馈按日期分组
basic_daily = Feedback.objects.filter(timestamp__date__gte=start_date) \
.values('timestamp__date') \
.annotate(
date=F('timestamp__date'),
total=Count('id'),
positive=Sum(Case(When(feedback_value__gt=0, then=1), default=0)),
negative=Sum(Case(When(feedback_value__lt=0, then=1), default=0))
) \
.order_by('date')
# 详细反馈按日期分组
detailed_daily = DetailedFeedback.objects.filter(created_at__date__gte=start_date) \
.values('created_at__date') \
.annotate(
date=F('created_at__date'),
total=Count('id'),
positive=Count('id', filter=Q(feedback_type='positive')),
negative=Count('id', filter=Q(feedback_type='negative'))
) \
.order_by('date')
# 合并两种反馈数据
daily_data = {}
for item in basic_daily:
date_str = item['date'].strftime('%Y-%m-%d')
daily_data[date_str] = {
'date': date_str,
'total': item['total'],
'positive': item['positive'],
'negative': item['negative']
}
for item in detailed_daily:
date_str = item['date'].strftime('%Y-%m-%d')
if date_str in daily_data:
daily_data[date_str]['total'] += item['total']
daily_data[date_str]['positive'] += item['positive']
daily_data[date_str]['negative'] += item['negative']
else:
daily_data[date_str] = {
'date': date_str,
'total': item['total'],
'positive': item['positive'],
'negative': item['negative']
}
# 计算正面反馈比例
for date_str, data in daily_data.items():
data['positive_rate'] = (data['positive'] / data['total'] * 100) if data['total'] > 0 else 0
# 转换为列表并按日期排序
result = list(daily_data.values())
result.sort(key=lambda x: x['date'])
return result
def get_user_stats(self):
"""获取用户统计信息"""
# 总用户数
total_users = User.objects.count()
# 有反馈记录的用户数
users_with_feedback = User.objects.filter(
Q(feedback__isnull=False) | Q(detailed_feedback__isnull=False)
).distinct().count()
# 最近30天活跃的标注用户
thirty_days_ago = timezone.now() - timedelta(days=30)
active_users = User.objects.filter(
Q(feedback__timestamp__gte=thirty_days_ago) |
Q(detailed_feedback__created_at__gte=thirty_days_ago)
).distinct().count()
# 计算每个用户的标注量
user_annotations = {}
for feedback in Feedback.objects.all():
user_id = str(feedback.user_id)
if user_id in user_annotations:
user_annotations[user_id] += 1
else:
user_annotations[user_id] = 1
for feedback in DetailedFeedback.objects.all():
user_id = str(feedback.user_id)
if user_id in user_annotations:
user_annotations[user_id] += 1
else:
user_annotations[user_id] = 1
# 计算平均每用户标注量
if user_annotations:
avg_annotations = sum(user_annotations.values()) / len(user_annotations)
else:
avg_annotations = 0
return {
'total_users': total_users,
'users_with_feedback': users_with_feedback,
'active_users': active_users,
'avg_annotations_per_user': avg_annotations
}
def export_data_to_json(self):
"""导出数据到JSON文件"""
data = {
'conversations': [],
'feedback_summary': self.get_feedback_stats(),
'tag_stats': self.get_tag_stats(),
'daily_trend': self.get_daily_feedback_trend(30),
'export_time': timezone.now().isoformat()
}
# 导出对话和消息数据
for conv in Conversation.objects.all().prefetch_related('messages'):
conv_data = {
'id': str(conv.id),
'created_at': conv.created_at.isoformat(),
'user_id': str(conv.user_id),
'is_submitted': conv.is_submitted,
'messages': []
}
for msg in conv.messages.all().order_by('timestamp'):
msg_data = {
'id': str(msg.id),
'role': msg.role,
'content': msg.content,
'timestamp': msg.timestamp.isoformat(),
'feedback': []
}
# 获取消息的反馈
for fb in Feedback.objects.filter(message_id=msg.id):
msg_data['feedback'].append({
'id': str(fb.id),
'type': 'basic',
'value': fb.feedback_value,
'user_id': str(fb.user_id),
'timestamp': fb.timestamp.isoformat()
})
# 获取详细反馈
for dfb in DetailedFeedback.objects.filter(message_id=msg.id):
try:
tags = json.loads(dfb.feedback_tags) if dfb.feedback_tags else []
except (json.JSONDecodeError, TypeError):
tags = [dfb.feedback_tags] if dfb.feedback_tags else []
msg_data['feedback'].append({
'id': str(dfb.id),
'type': 'detailed',
'feedback_type': dfb.feedback_type,
'tags': tags,
'custom_tags': dfb.custom_tags,
'custom_content': dfb.custom_content,
'is_inline': dfb.is_inline,
'user_id': str(dfb.user_id),
'timestamp': dfb.created_at.isoformat()
})
conv_data['messages'].append(msg_data)
data['conversations'].append(conv_data)
# 保存到文件
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f'rlhf_data_export_{timestamp}.json'
with open(filename, 'w', encoding='utf-8') as f:
json.dump(data, f, ensure_ascii=False, indent=2)
return filename