from django.core.management.base import BaseCommand from rlhf.models import Conversation, Message, Feedback, DetailedFeedback, FeedbackTag from django.db.models import Count, Avg, Sum, Q, F from django.utils import timezone from django.contrib.auth import get_user_model import json from datetime import datetime, timedelta User = get_user_model() class Command(BaseCommand): help = '分析RLHF反馈数据,生成统计报告' def add_arguments(self, parser): parser.add_argument( '--export', action='store_true', help='导出数据到JSON文件', ) parser.add_argument( '--days', type=int, default=30, help='分析最近的天数', ) def handle(self, *args, **options): self.stdout.write(self.style.SUCCESS("=" * 60)) self.stdout.write(self.style.SUCCESS("🤖 在线人类反馈强化学习系统 - 数据分析报告")) self.stdout.write(self.style.SUCCESS("=" * 60)) # 基本统计 feedback_stats = self.get_feedback_stats() self.stdout.write(self.style.SUCCESS(f"\n📊 反馈统计:")) self.stdout.write(f" 总反馈数量: {feedback_stats['total_feedback']}") self.stdout.write(f" 正面反馈: {feedback_stats['positive_feedback']} ({feedback_stats['positive_rate']:.1f}%)") self.stdout.write(f" 负面反馈: {feedback_stats['negative_feedback']}") self.stdout.write(f" 平均反馈分数: {feedback_stats['avg_feedback']:.2f}") # 对话统计 conv_stats = self.get_conversation_stats() self.stdout.write(self.style.SUCCESS(f"\n💬 对话统计:")) self.stdout.write(f" 总对话数量: {conv_stats['total_conversations']}") self.stdout.write(f" 总消息数量: {conv_stats['total_messages']}") self.stdout.write(f" 平均每对话消息数: {conv_stats['avg_messages_per_conversation']:.1f}") # 标签统计 tag_stats = self.get_tag_stats() self.stdout.write(self.style.SUCCESS(f"\n🏷️ 标签统计:")) self.stdout.write(f" 最常用的正面标签:") for tag in tag_stats['top_positive']: self.stdout.write(f" - {tag['tag_name']}: {tag['count']}次") self.stdout.write(f" 最常用的负面标签:") for tag in tag_stats['top_negative']: self.stdout.write(f" - {tag['tag_name']}: {tag['count']}次") # 每日趋势 days = options['days'] daily_trend = self.get_daily_feedback_trend(days) self.stdout.write(self.style.SUCCESS(f"\n📈 最近{days}天反馈趋势:")) for day in daily_trend: self.stdout.write(f" {day['date']}: {day['total']}条反馈 (正面率: {day['positive_rate']:.1f}%)") # 用户统计 user_stats = self.get_user_stats() self.stdout.write(self.style.SUCCESS(f"\n👥 用户统计:")) self.stdout.write(f" 总用户数量: {user_stats['total_users']}") self.stdout.write(f" 活跃标注用户: {user_stats['active_users']}") self.stdout.write(f" 平均每用户标注量: {user_stats['avg_annotations_per_user']:.1f}") # 导出数据 if options['export']: filename = self.export_data_to_json() self.stdout.write(self.style.SUCCESS(f"\n✅ 数据已导出到: {filename}")) def get_feedback_stats(self): """获取反馈统计信息""" # 基本反馈统计 basic_feedback = Feedback.objects.aggregate( total=Count('id'), positive=Sum(Case(When(feedback_value__gt=0, then=1), default=0)), negative=Sum(Case(When(feedback_value__lt=0, then=1), default=0)), avg=Avg('feedback_value') ) # 详细反馈统计 detailed_feedback = DetailedFeedback.objects.aggregate( total=Count('id'), positive=Count('id', filter=Q(feedback_type='positive')), negative=Count('id', filter=Q(feedback_type='negative')) ) # 合并统计 total = (basic_feedback['total'] or 0) + (detailed_feedback['total'] or 0) positive = (basic_feedback['positive'] or 0) + (detailed_feedback['positive'] or 0) negative = (basic_feedback['negative'] or 0) + (detailed_feedback['negative'] or 0) # 计算平均分和正面比例 avg_feedback = basic_feedback['avg'] or 0 positive_rate = (positive / total * 100) if total > 0 else 0 return { 'total_feedback': total, 'positive_feedback': positive, 'negative_feedback': negative, 'avg_feedback': avg_feedback, 'positive_rate': positive_rate } def get_conversation_stats(self): """获取对话统计信息""" total_conversations = Conversation.objects.count() total_messages = Message.objects.count() # 计算每个对话的消息数量分布 conversation_messages = Message.objects.values('conversation').annotate(count=Count('id')) avg_messages = conversation_messages.aggregate(Avg('count'))['count__avg'] or 0 return { 'total_conversations': total_conversations, 'total_messages': total_messages, 'avg_messages_per_conversation': avg_messages } def get_tag_stats(self): """获取标签使用统计""" # 分析DetailedFeedback中的标签使用情况 # 注意:由于标签可能存储为JSON字符串,这里需要解析 # 首先获取所有的标签 all_tags = FeedbackTag.objects.all() tag_id_to_name = {str(tag.id): tag.tag_name for tag in all_tags} # 计算每个标签的使用次数 tag_counts = {} for feedback in DetailedFeedback.objects.all(): if feedback.feedback_tags: try: # 尝试解析JSON标签列表 tag_ids = json.loads(feedback.feedback_tags) if isinstance(tag_ids, list): for tag_id in tag_ids: tag_id = str(tag_id) if tag_id in tag_counts: tag_counts[tag_id] += 1 else: tag_counts[tag_id] = 1 except (json.JSONDecodeError, TypeError): # 如果不是有效的JSON,可能是单个标签ID tag_id = str(feedback.feedback_tags) if tag_id in tag_counts: tag_counts[tag_id] += 1 else: tag_counts[tag_id] = 1 # 获取排名前5的正面和负面标签 positive_tags = FeedbackTag.objects.filter(tag_type='positive') negative_tags = FeedbackTag.objects.filter(tag_type='negative') top_positive = [] for tag in positive_tags: tag_id = str(tag.id) if tag_id in tag_counts: top_positive.append({ 'tag_name': tag.tag_name, 'count': tag_counts[tag_id] }) top_negative = [] for tag in negative_tags: tag_id = str(tag.id) if tag_id in tag_counts: top_negative.append({ 'tag_name': tag.tag_name, 'count': tag_counts[tag_id] }) # 按使用次数排序 top_positive.sort(key=lambda x: x['count'], reverse=True) top_negative.sort(key=lambda x: x['count'], reverse=True) # 取前5 return { 'top_positive': top_positive[:5], 'top_negative': top_negative[:5] } def get_daily_feedback_trend(self, days=30): """获取每日反馈趋势""" # 计算开始日期 start_date = timezone.now().date() - timedelta(days=days) # 基本反馈按日期分组 basic_daily = Feedback.objects.filter(timestamp__date__gte=start_date) \ .values('timestamp__date') \ .annotate( date=F('timestamp__date'), total=Count('id'), positive=Sum(Case(When(feedback_value__gt=0, then=1), default=0)), negative=Sum(Case(When(feedback_value__lt=0, then=1), default=0)) ) \ .order_by('date') # 详细反馈按日期分组 detailed_daily = DetailedFeedback.objects.filter(created_at__date__gte=start_date) \ .values('created_at__date') \ .annotate( date=F('created_at__date'), total=Count('id'), positive=Count('id', filter=Q(feedback_type='positive')), negative=Count('id', filter=Q(feedback_type='negative')) ) \ .order_by('date') # 合并两种反馈数据 daily_data = {} for item in basic_daily: date_str = item['date'].strftime('%Y-%m-%d') daily_data[date_str] = { 'date': date_str, 'total': item['total'], 'positive': item['positive'], 'negative': item['negative'] } for item in detailed_daily: date_str = item['date'].strftime('%Y-%m-%d') if date_str in daily_data: daily_data[date_str]['total'] += item['total'] daily_data[date_str]['positive'] += item['positive'] daily_data[date_str]['negative'] += item['negative'] else: daily_data[date_str] = { 'date': date_str, 'total': item['total'], 'positive': item['positive'], 'negative': item['negative'] } # 计算正面反馈比例 for date_str, data in daily_data.items(): data['positive_rate'] = (data['positive'] / data['total'] * 100) if data['total'] > 0 else 0 # 转换为列表并按日期排序 result = list(daily_data.values()) result.sort(key=lambda x: x['date']) return result def get_user_stats(self): """获取用户统计信息""" # 总用户数 total_users = User.objects.count() # 有反馈记录的用户数 users_with_feedback = User.objects.filter( Q(feedback__isnull=False) | Q(detailed_feedback__isnull=False) ).distinct().count() # 最近30天活跃的标注用户 thirty_days_ago = timezone.now() - timedelta(days=30) active_users = User.objects.filter( Q(feedback__timestamp__gte=thirty_days_ago) | Q(detailed_feedback__created_at__gte=thirty_days_ago) ).distinct().count() # 计算每个用户的标注量 user_annotations = {} for feedback in Feedback.objects.all(): user_id = str(feedback.user_id) if user_id in user_annotations: user_annotations[user_id] += 1 else: user_annotations[user_id] = 1 for feedback in DetailedFeedback.objects.all(): user_id = str(feedback.user_id) if user_id in user_annotations: user_annotations[user_id] += 1 else: user_annotations[user_id] = 1 # 计算平均每用户标注量 if user_annotations: avg_annotations = sum(user_annotations.values()) / len(user_annotations) else: avg_annotations = 0 return { 'total_users': total_users, 'users_with_feedback': users_with_feedback, 'active_users': active_users, 'avg_annotations_per_user': avg_annotations } def export_data_to_json(self): """导出数据到JSON文件""" data = { 'conversations': [], 'feedback_summary': self.get_feedback_stats(), 'tag_stats': self.get_tag_stats(), 'daily_trend': self.get_daily_feedback_trend(30), 'export_time': timezone.now().isoformat() } # 导出对话和消息数据 for conv in Conversation.objects.all().prefetch_related('messages'): conv_data = { 'id': str(conv.id), 'created_at': conv.created_at.isoformat(), 'user_id': str(conv.user_id), 'is_submitted': conv.is_submitted, 'messages': [] } for msg in conv.messages.all().order_by('timestamp'): msg_data = { 'id': str(msg.id), 'role': msg.role, 'content': msg.content, 'timestamp': msg.timestamp.isoformat(), 'feedback': [] } # 获取消息的反馈 for fb in Feedback.objects.filter(message_id=msg.id): msg_data['feedback'].append({ 'id': str(fb.id), 'type': 'basic', 'value': fb.feedback_value, 'user_id': str(fb.user_id), 'timestamp': fb.timestamp.isoformat() }) # 获取详细反馈 for dfb in DetailedFeedback.objects.filter(message_id=msg.id): try: tags = json.loads(dfb.feedback_tags) if dfb.feedback_tags else [] except (json.JSONDecodeError, TypeError): tags = [dfb.feedback_tags] if dfb.feedback_tags else [] msg_data['feedback'].append({ 'id': str(dfb.id), 'type': 'detailed', 'feedback_type': dfb.feedback_type, 'tags': tags, 'custom_tags': dfb.custom_tags, 'custom_content': dfb.custom_content, 'is_inline': dfb.is_inline, 'user_id': str(dfb.user_id), 'timestamp': dfb.created_at.isoformat() }) conv_data['messages'].append(msg_data) data['conversations'].append(conv_data) # 保存到文件 timestamp = datetime.now().strftime('%Y%m%d_%H%M%S') filename = f'rlhf_data_export_{timestamp}.json' with open(filename, 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=2) return filename