"""Validate the ckandemir/amazon-products dataset and pickle the usable rows.

A row is kept when it has a non-empty product name and its image URL
answers with HTTP 200 and a payload that Pillow can fully decode. The kept
rows are written to ``valid_products.pkl`` as a list of
``{'image': PIL.Image.Image, 'name': str}`` dictionaries.
"""
import pickle
from io import BytesIO

import pandas as pd
import requests
from datasets import load_dataset
from PIL import Image

# Seconds to wait for each image request before giving up on that row.
REQUEST_TIMEOUT_SECONDS = 5
# Print a progress line every time this many rows have been accepted.
PROGRESS_INTERVAL = 100
# Destination of the pickled list of validated rows.
OUTPUT_PATH = 'valid_products.pkl'


def _has_product_name(name):
    """Return True when *name* is neither empty nor a pandas missing value."""
    return bool(name) and not pd.isna(name)


def _fetch_image(session, url):
    """Download *url* and return it as a fully decoded PIL image.

    Args:
        session: ``requests.Session`` used to issue the GET request.
        url: Image URL taken from the dataset row.

    Returns:
        The decoded ``PIL.Image.Image``, or ``None`` when the server did
        not answer with HTTP 200.

    Raises:
        requests.RequestException: The request failed or timed out.
        OSError: Pillow could not identify or decode the payload
            (other Pillow errors may propagate as well).
    """
    response = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    if response.status_code != 200:
        return None

    image = Image.open(BytesIO(response.content))
    # Image.open() is lazy and only parses the header. Force a full decode
    # here so that truncated or corrupt downloads are rejected during
    # validation instead of blowing up later inside pickle.dump().
    image.load()
    return image


ds = load_dataset("ckandemir/amazon-products")
train_split = ds['train']

valid_data = []
processed_count = 0  # number of rows accepted so far
failed_count = 0  # rows that had a name but whose image could not be used

# Walk the dataset and validate every row. One session is shared so that
# connections to the image hosts are reused across requests.
with requests.Session() as session:
    for idx, item in enumerate(train_split):
        # Skip rows without a usable product name.
        name = item['Product Name']
        if not _has_product_name(name):
            continue

        # Check that the image URL can actually be downloaded and decoded.
        try:
            image = _fetch_image(session, item['Image'])
        except Exception:
            # Deliberately broad: this is a best-effort crawl and a single
            # bad URL or malformed file must not abort the whole run. The
            # failure is counted and reported rather than silently dropped.
            failed_count += 1
            continue

        if image is None:
            failed_count += 1
            continue

        # The image decoded successfully: keep it together with its name.
        valid_data.append({
            'image': image,
            'name': name,
        })
        processed_count += 1

        # Report progress once per PROGRESS_INTERVAL accepted rows.
        if processed_count % PROGRESS_INTERVAL == 0:
            print(f"已成功处理 {processed_count} 条数据,当前处理到第 {idx+1} 条")

print("\n处理完成!")
print(f"原始数据数量: {len(train_split)}")
print(f"清洗后的有效数据数量: {len(valid_data)}")
print(f"图片无法加载而被跳过的数据数量: {failed_count}")

# Persist the validated rows.
# NOTE: pickle files can execute arbitrary code when loaded; consumers must
# only unpickle this file when it comes from a trusted source.
with open(OUTPUT_PATH, 'wb') as f:
    pickle.dump(valid_data, f)