递归挖掘TikTok网络中的相关用户和热门内容

This commit is contained in:
wanjia 2025-03-05 19:57:43 +08:00
parent 4ccfc764f7
commit ab858bcfa8
24 changed files with 251 additions and 0 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -17,4 +17,5 @@ urlpatterns = [
path('directory-status/', views.get_directory_status, name='directory_status'), path('directory-status/', views.get_directory_status, name='directory_status'),
path('tiktok/user-videos/', views.get_tiktok_user_videos, name='get-tiktok-user-videos'), path('tiktok/user-videos/', views.get_tiktok_user_videos, name='get-tiktok-user-videos'),
path('api/tiktok/fetch_videos/', views.fetch_tiktok_videos, name='fetch_tiktok_videos'), path('api/tiktok/fetch_videos/', views.fetch_tiktok_videos, name='fetch_tiktok_videos'),
path('api/recursive_fetch_videos', views.recursive_fetch_videos, name='recursive_fetch_videos'),
] ]

View File

@ -1359,3 +1359,253 @@ def download_video(video_id, unique_id, save_path):
import traceback import traceback
logger.error(f"详细错误: {traceback.format_exc()}") logger.error(f"详细错误: {traceback.format_exc()}")
return False return False
def fetch_user_followings(sec_uid):
    """Fetch the list of accounts that a TikTok user follows.

    Args:
        sec_uid: The user's ``secUid``; sent as the ``secUid`` query parameter.

    Returns:
        The decoded JSON response (a dict whose ``data`` entry is a dict and
        whose ``data['userList']``, when present, is a list), or ``None`` when
        the request fails, the HTTP status is not 200, or the payload does not
        have that shape.
    """
    url = f"{API_BASE_URL}/api/tiktok/web/fetch_user_follow"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    }
    try:
        # Let requests build the query string so sec_uid is URL-encoded instead
        # of being spliced verbatim into the URL.
        response = requests.get(
            url,
            params={'secUid': sec_uid},
            headers=headers,
            timeout=30,
        )
        if response.status_code != 200:
            logger.error(f"获取用户关注列表失败: {response.status_code}")
            return None

        data = response.json()

        # Validate the shape explicitly: callers index data['data']['userList']
        # and iterate it, so anything else is treated as a failed fetch.
        payload = data.get('data') if isinstance(data, dict) else None
        user_list = payload.get('userList', []) if isinstance(payload, dict) else None
        if not isinstance(user_list, list):
            logger.error("获取用户关注列表异常: 响应数据格式不正确")
            return None

        logger.info(f"成功获取用户关注列表,共 {len(user_list)} 个关注")
        return data
    except Exception as e:
        # Best-effort boundary: network errors and invalid JSON both yield None.
        logger.error(f"获取用户关注列表异常: {e}")
        return None
def filter_users_by_followers(user_list, min_followers=5000, max_followers=50000):
    """Keep the entries whose follower count lies in an inclusive range.

    Each entry is read as ``entry['stats']['followerCount']``; a missing
    ``stats`` mapping or missing ``followerCount`` counts as 0 followers.

    Args:
        user_list: Iterable of user entries (dict-like objects).
        min_followers: Inclusive lower bound on the follower count.
        max_followers: Inclusive upper bound on the follower count.

    Returns:
        A new list with the matching entries, in their original order.
        Entries that raise while being inspected are logged and skipped.
    """
    selected = []
    for entry in user_list:
        try:
            count = entry.get('stats', {}).get('followerCount', 0)
            within_range = min_followers <= count <= max_followers
        except Exception as e:
            # One malformed entry must not abort the whole filter pass.
            logger.error(f"筛选用户时出错: {e}")
            continue
        if within_range:
            selected.append(entry)
    return selected
def _resolve_tiktok_user_dir(unique_id):
    """Return the download directory for ``unique_id``, or ``None`` if unsafe.

    ``unique_id`` comes from the request body or from the remote API, so it is
    only accepted when it is a single, non-empty path component. That keeps
    values such as ``../x`` or ``/etc`` from escaping ``TIKTOK_VIDEOS_PATH``.
    """
    if not isinstance(unique_id, str) or unique_id in ('', '.', '..'):
        return None
    # basename() differs from the input as soon as it contains a separator
    # (or, on Windows, a drive prefix).
    if os.path.basename(unique_id) != unique_id:
        return None
    return os.path.join(TIKTOK_VIDEOS_PATH, unique_id)


@csrf_exempt
@require_http_methods(["POST"])
def recursive_fetch_videos(request):
    """Crawl a TikTok user's following graph and download top videos.

    Expects a JSON body with:
        unique_id: Handle of the user to start from (required).
        max_depth: Number of levels to process, the start user being level 0
            (optional, default 3).

    For every processed user the (up to) 10 most-played videos are downloaded
    and a ``TiktokUserVideos`` row is updated. From each user's following list
    the first 5 accounts with 5000-50000 followers are stored and then
    processed one level deeper.

    Returns:
        A ``JsonResponse`` with ``status`` ``'success'`` (counts plus at most
        100 downloaded-video summaries) or ``'error'`` with a ``message``.
    """
    try:
        data = json.loads(request.body)
        start_unique_id = data.get('unique_id')
        # NOTE(review): max_depth is caller-controlled and unbounded; with up
        # to 5 followings per user the work grows roughly as 5 ** max_depth.
        max_depth = int(data.get('max_depth', 3))  # default depth is 3

        if not start_unique_id:
            return JsonResponse({
                'status': 'error',
                'message': '请提供起始TikTok用户ID(unique_id)'
            }, json_dumps_params={'ensure_ascii': False})

        # Reject handles that would resolve outside the videos directory
        # before doing any network or filesystem work.
        start_user_dir = _resolve_tiktok_user_dir(start_unique_id)
        if start_user_dir is None:
            return JsonResponse({
                'status': 'error',
                'message': '无效的TikTok用户ID(unique_id)'
            }, json_dumps_params={'ensure_ascii': False})

        # Fetch the start user's profile; it carries the secUid.
        user_profile = fetch_user_profile(start_unique_id)
        if not user_profile or 'data' not in user_profile:
            return JsonResponse({
                'status': 'error',
                'message': f'无法获取用户 {start_unique_id} 的资料'
            }, json_dumps_params={'ensure_ascii': False})

        # Extract secUid and the remaining profile fields.
        try:
            user_info = user_profile['data']['userInfo']['user']
            start_sec_uid = user_info['secUid']

            # Descriptive fields of the start user.
            start_nickname = user_info.get('nickname', '')
            signature = user_info.get('signature', '')
            avatar_url = user_info.get('avatarLarger', '')

            # Counters.
            stats = user_profile['data']['userInfo'].get('stats', {})
            follower_count = stats.get('followerCount', 0)
            following_count = stats.get('followingCount', 0)
            heart_count = stats.get('heartCount', 0) or stats.get('diggCount', 0)
            video_count = stats.get('videoCount', 0)

            # Create the start user's directory.
            os.makedirs(start_user_dir, exist_ok=True)

            # Persist the start user's profile.
            TiktokUserVideos.objects.update_or_create(
                sec_user_id=start_sec_uid,
                defaults={
                    'nickname': start_nickname,
                    'signature': signature,
                    'follower_count': follower_count,
                    'following_count': following_count,
                    'total_favorited': heart_count,
                    'video_count': video_count,
                    'avatar_url': avatar_url,
                    'videos_folder': start_user_dir
                }
            )

            logger.info(f"成功获取并保存起始用户信息: {start_unique_id}, secUid: {start_sec_uid}")
        except (KeyError, TypeError) as e:
            logger.error(f"解析用户资料出错: {e}")
            return JsonResponse({
                'status': 'error',
                'message': f'解析用户资料出错: {str(e)}'
            }, json_dumps_params={'ensure_ascii': False})

        # State shared by the recursive crawl.
        all_downloaded_videos = []
        processed_users = set()  # secUids already handled, to avoid repeats

        def process_user(sec_uid, unique_id, depth=0, nickname=None):
            """Download one user's top videos, then recurse into followings.

            ``nickname`` is the display name already known for the user; it is
            written back unchanged so the row saved earlier is not overwritten
            with the handle. The handle is only a fallback.
            """
            if depth >= max_depth or sec_uid in processed_users:
                return

            user_dir = _resolve_tiktok_user_dir(unique_id)
            if user_dir is None:
                logger.warning(f"跳过用户,unique_id 不合法: {unique_id!r}")
                return

            processed_users.add(sec_uid)
            logger.info(f"处理用户 {unique_id},递归深度: {depth}")

            # Make sure the user's directory exists.
            os.makedirs(user_dir, exist_ok=True)

            # Collect the user's videos with their play counts.
            videos_data = fetch_user_videos(sec_uid)
            all_videos = []

            if videos_data and isinstance(videos_data, dict) and 'data' in videos_data and 'itemList' in videos_data['data']:
                for video in videos_data['data']['itemList']:
                    try:
                        video_id = video.get('id', '')
                        if not video_id or not str(video_id).isdigit():
                            continue

                        video_stats = video.get('stats', {})
                        if not isinstance(video_stats, dict):
                            video_stats = {}

                        play_count = int(video_stats.get('playCount', 0))

                        all_videos.append({
                            'id': video_id,
                            # Normalise a null/absent description to '' so it
                            # can be sliced when the response is built.
                            'desc': str(video.get('desc') or ''),
                            'play_count': play_count
                        })
                    except Exception as e:
                        logger.error(f"处理视频数据出错: {str(e)}")
                        continue

            # Sort by play count and keep the top 10.
            all_videos.sort(key=lambda x: x['play_count'], reverse=True)
            top_videos = all_videos[:10]

            # Download them.
            downloaded_videos = []
            for i, video in enumerate(top_videos):
                video_id = video['id']
                save_path = os.path.join(user_dir, f"{video_id}.mp4")

                logger.info(f"下载用户 {unique_id} 的第 {i+1} 个热门视频: {video_id}")
                if download_video(video_id, unique_id, save_path):
                    video['download_path'] = save_path
                    video['user_unique_id'] = unique_id
                    downloaded_videos.append(video)
                    all_downloaded_videos.append(video)

                time.sleep(1)  # throttle requests

            # Persist what was downloaded for this user.
            video_info_json = json.dumps([{
                'id': v['id'],
                'desc': v['desc'],
                'play_count': v['play_count']
            } for v in downloaded_videos], ensure_ascii=False)

            TiktokUserVideos.objects.update_or_create(
                sec_user_id=sec_uid,
                defaults={
                    'nickname': nickname or unique_id,
                    'videos_folder': user_dir,
                    'video_paths': video_info_json
                }
            )

            # Fetch the following list.
            followings_data = fetch_user_followings(sec_uid)

            if followings_data and 'data' in followings_data and 'userList' in followings_data['data']:
                user_list = followings_data['data']['userList']

                # Keep accounts with 5000-50000 followers.
                filtered_users = filter_users_by_followers(user_list, 5000, 50000)
                logger.info(f"用户 {unique_id} 的关注列表中有 {len(filtered_users)} 个粉丝数在5000-50000之间")

                # Only the first 5 are followed up.
                for user_data in filtered_users[:5]:
                    try:
                        # Profile fields come straight from the list entry.
                        user_obj = user_data['user']
                        following_sec_uid = user_obj['secUid']
                        following_unique_id = user_obj['uniqueId']

                        # The handle comes from the remote API; never let it
                        # point outside the videos directory.
                        follow_user_dir = _resolve_tiktok_user_dir(following_unique_id)
                        if follow_user_dir is None:
                            logger.warning(f"跳过关注用户,unique_id 不合法: {following_unique_id!r}")
                            continue

                        following_nickname = user_obj.get('nickname', '')
                        following_signature = user_obj.get('signature', '')
                        following_avatar_url = user_obj.get('avatarLarger', '')

                        follow_stats = user_data.get('stats', {})

                        # Store the profile even if no video is downloaded yet.
                        TiktokUserVideos.objects.update_or_create(
                            sec_user_id=following_sec_uid,
                            defaults={
                                'nickname': following_nickname,
                                'signature': following_signature,
                                'follower_count': follow_stats.get('followerCount', 0),
                                'following_count': follow_stats.get('followingCount', 0),
                                'total_favorited': follow_stats.get('heartCount', 0),
                                'video_count': follow_stats.get('videoCount', 0),
                                'avatar_url': following_avatar_url,
                                'videos_folder': follow_user_dir
                            }
                        )

                        # Recurse one level deeper.
                        process_user(
                            following_sec_uid,
                            following_unique_id,
                            depth + 1,
                            nickname=following_nickname,
                        )
                    except Exception as e:
                        logger.error(f"处理关注用户时出错: {e}")
                        continue

        # Start the crawl at the requested user.
        process_user(start_sec_uid, start_unique_id, nickname=start_nickname)

        return JsonResponse({
            'status': 'success',
            'message': '递归获取视频完成',
            'processed_users_count': len(processed_users),
            'downloaded_videos_count': len(all_downloaded_videos),
            # Only the first 100 summaries are returned to bound response size.
            'downloaded_videos': [
                {
                    'id': v['id'],
                    'desc': v['desc'][:50],
                    'play_count': v['play_count'],
                    'user': v['user_unique_id'],
                }
                for v in all_downloaded_videos[:100]
            ]
        }, json_dumps_params={'ensure_ascii': False})

    except Exception as e:
        logger.error(f"递归获取TikTok视频失败: {e}")
        import traceback
        logger.error(f"详细错误: {traceback.format_exc()}")
        return JsonResponse({
            'status': 'error',
            'message': f'递归获取TikTok视频失败: {str(e)}'
        }, json_dumps_params={'ensure_ascii': False})