45 lines
1.4 KiB
Python
45 lines
1.4 KiB
Python
|
from datasets import load_dataset
|
||
|
import requests
|
||
|
from PIL import Image
|
||
|
from io import BytesIO
|
||
|
import pandas as pd
|
||
|
|
||
|
import pickle

# Seconds to wait for each image download before giving up on that row.
REQUEST_TIMEOUT_SECONDS = 5
# Print a progress line every time this many rows have been accepted.
PROGRESS_INTERVAL = 100
# Destination of the pickled list of {'image', 'name'} records.
OUTPUT_PATH = 'valid_products.pkl'


def _fetch_image(session, url):
    """Download *url* and return a fully decoded PIL image.

    Returns None when the server answers with a status other than 200.
    Any network or decoding error propagates to the caller.
    """
    response = session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    if response.status_code != 200:
        return None
    image = Image.open(BytesIO(response.content))
    # Image.open only identifies the file; force the pixel data to be decoded
    # now so that truncated or corrupt downloads are rejected here instead of
    # blowing up later when the image is pickled.
    image.load()
    return image


ds = load_dataset("ckandemir/amazon-products")
train_split = ds['train']

valid_data = []
processed_count = 0  # number of rows accepted so far
failed_count = 0  # rows with a name whose image could not be fetched/decoded

# One session for the whole run so connections to the image host are reused.
with requests.Session() as session:
    # Walk the dataset and keep only rows with a name and a loadable image.
    for idx, item in enumerate(train_split):
        product_name = item['Product Name']
        # Skip rows whose product name is missing or empty.
        if not product_name or pd.isna(product_name):
            continue

        try:
            image = _fetch_image(session, item['Image'])
        except Exception:
            # Best-effort cleaning: a single bad URL, timeout or undecodable
            # image must not abort the run, but it is counted and reported.
            failed_count += 1
            continue

        if image is None:
            failed_count += 1
            continue

        valid_data.append({
            'image': image,
            'name': product_name,
        })
        processed_count += 1

        # Report progress once per PROGRESS_INTERVAL accepted rows.
        if processed_count % PROGRESS_INTERVAL == 0:
            print(f"已成功处理 {processed_count} 条数据,当前处理到第 {idx+1} 条")

print("\n处理完成!")
print(f"原始数据数量: {len(train_split)}")
print(f"清洗后的有效数据数量: {len(valid_data)}")
print(f"图片下载或解码失败的数量: {failed_count}")

# Persist the cleaned records. NOTE: the result is a pickle file; it should
# only ever be unpickled by code that trusts where the file came from.
with open(OUTPUT_PATH, 'wb') as f:
    pickle.dump(valid_data, f)
|
||
|
|