commit 655260aabf8b278beda5a3e028e52cb8d6573879
Author: Zixiao Wang
Date:   Thu Feb 13 17:15:57 2025 +0800

    first commit

diff --git a/predata.py b/predata.py
new file mode 100644
index 0000000..8e063ac
--- /dev/null
+++ b/predata.py
@@ -0,0 +1,202 @@
+import torch
+import torchvision.transforms as T
+from PIL import Image
+from torchvision.transforms.functional import InterpolationMode
+import requests
+from io import BytesIO
+from urllib.parse import urlparse
+
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
+
+def build_transform(input_size):
+    # Standard InternVL preprocessing: RGB conversion, bicubic resize,
+    # ImageNet normalization.
+    transform = T.Compose([
+        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+        T.ToTensor(),
+        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD)
+    ])
+    return transform
+
+def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+    # Pick the tiling grid whose aspect ratio best matches the input image;
+    # ties are broken in favor of more tiles when the image is large enough.
+    best_ratio_diff = float('inf')
+    best_ratio = (1, 1)
+    area = width * height
+    for ratio in target_ratios:
+        target_aspect_ratio = ratio[0] / ratio[1]
+        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+        if ratio_diff < best_ratio_diff:
+            best_ratio_diff = ratio_diff
+            best_ratio = ratio
+        elif ratio_diff == best_ratio_diff:
+            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                best_ratio = ratio
+    return best_ratio
+
+def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
+    # Split the image into an i x j grid of image_size tiles, where the grid
+    # is chosen to best preserve the original aspect ratio.
+    orig_width, orig_height = image.size
+    aspect_ratio = orig_width / orig_height
+
+    target_ratios = set(
+        (i, j) for n in range(min_num, max_num + 1)
+        for i in range(1, n + 1)
+        for j in range(1, n + 1)
+        if i * j <= max_num and i * j >= min_num
+    )
+    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+    target_aspect_ratio = find_closest_aspect_ratio(
+        aspect_ratio, target_ratios, orig_width, orig_height, image_size)
+
+    target_width = image_size * target_aspect_ratio[0]
+    target_height = image_size * target_aspect_ratio[1]
+    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+    resized_img = image.resize((target_width, target_height))
+    processed_images = []
+    for i in range(blocks):
+        box = (
+            (i % (target_width // image_size)) * image_size,
+            (i // (target_width // image_size)) * image_size,
+            ((i % (target_width // image_size)) + 1) * image_size,
+            ((i // (target_width // image_size)) + 1) * image_size
+        )
+        split_img = resized_img.crop(box)
+        processed_images.append(split_img)
+
+    assert len(processed_images) == blocks
+    if use_thumbnail and len(processed_images) != 1:
+        thumbnail_img = image.resize((image_size, image_size))
+        processed_images.append(thumbnail_img)
+    return processed_images
+
+def load_image(image_file, input_size=448, max_num=12):
+    # Handle the different supported input types
+    if isinstance(image_file, Image.Image):
+        image = image_file.convert('RGB')
+    elif isinstance(image_file, str) and bool(urlparse(image_file).netloc):
+        try:
+            response = requests.get(image_file, timeout=10)
+            response.raise_for_status()
+            image = Image.open(BytesIO(response.content)).convert('RGB')
+        except Exception as e:
+            raise ValueError(f"Failed to load image from URL: {str(e)}")
+    elif isinstance(image_file, str):
+        image = Image.open(image_file).convert('RGB')
+    elif isinstance(image_file, bytes):
+        image = Image.open(BytesIO(image_file)).convert('RGB')
+    else:
+        raise ValueError(f"Unsupported image input type: {type(image_file)}")
+
+    transform = build_transform(input_size=input_size)
+    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
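+    # `images` now holds the grid tiles plus one whole-image thumbnail
+    # (use_thumbnail=True), each an input_size x input_size PIL image.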
+    pixel_values = [transform(image) for image in images]
+    pixel_values = torch.stack(pixel_values)
+    return pixel_values
+
+def prepare_training_data(question: str, answer: str, tokenizer, image_path=None, input_size=448, max_num=12, num_image_token: int = 256):
+    """
+    Prepare a complete training example (inputs and labels).
+
+    Args:
+        question (str): the user question
+        answer (str): the assistant answer text
+        tokenizer: the tokenizer
+        image_path: an image path, URL, or PIL Image object
+        input_size (int): image input size
+        max_num (int): maximum number of image tiles
+        num_image_token (int): number of tokens per image tile
+
+    Returns:
+        dict: everything the model needs for one training step
+    """
+    # 1. Process the image input
+    pixel_values = None
+    if image_path is not None:
+        pixel_values = load_image(image_path, input_size=input_size, max_num=max_num)[-1:]  # keep only the last tile (the whole-image thumbnail)
+        if torch.cuda.is_available():
+            pixel_values = pixel_values.to(torch.bfloat16)
+
+    # 2. Make sure the question contains the image placeholder
+    if pixel_values is not None and '<image>' not in question:
+        question = '<image>\n' + question
+
+    # 3. Derive num_patches from pixel_values
+    num_patches = pixel_values.shape[0] if pixel_values is not None else 0
+
+    # 4. Build the conversation (the standard InternVL Chinese system prompt,
+    #    kept verbatim because it is part of the training template)
+    system_msg = "你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。"
+
+    # 5. Expand the image placeholder into image tokens
+    if num_patches > 0:
+        image_tokens = '<img>' + '<IMG_CONTEXT>' * (num_image_token * num_patches) + '</img>'
+        question = question.replace('<image>', image_tokens, 1)
+
+    # 6. Build the full prompt (answer included)
+    full_prompt = (
+        f"<|im_start|>system\n{system_msg}<|im_end|>\n"
+        f"<|im_start|>user\n{question}<|im_end|>\n"
+        f"<|im_start|>assistant\n{answer}<|im_end|>\n"
+    )
+
+    # 7. Tokenize into model inputs
+    model_inputs = tokenizer(
+        full_prompt,
+        return_tensors="pt",
+        add_special_tokens=False
+    )
+
+    # 8. Build labels: supervise only the assistant turn
+    input_ids = model_inputs["input_ids"]
+    labels = input_ids.clone()
+
+    assistant_start_token = "<|im_start|>assistant\n"
+    assistant_token_ids = tokenizer(assistant_start_token, add_special_tokens=False)["input_ids"]
+
+    assistant_start_pos = None
+    for i in range(len(input_ids[0]) - len(assistant_token_ids) + 1):
+        if input_ids[0][i:i+len(assistant_token_ids)].tolist() == assistant_token_ids:
+            assistant_start_pos = i
+            break
+
+    if assistant_start_pos is not None:
+        # mask everything up to and including the assistant role prefix, so
+        # the loss covers only the answer and the end-of-turn marker
+        labels[0, :assistant_start_pos + len(assistant_token_ids)] = -100
+
+    return {
+        "input_ids": input_ids,
+        "attention_mask": model_inputs["attention_mask"],
+        "labels": labels,
+        "pixel_values": pixel_values
+    }
+
+# Usage example:
+"""
+import torch
+from transformers import AutoTokenizer
+
+# Initialize the tokenizer
+tokenizer = AutoTokenizer.from_pretrained("path_to_tokenizer")
+
+# Prepare sample data
+question = "Describe what is in this picture."
+answer = "A beautiful landscape photo with green hills and clear water."
+image_path = "./examples/image1.jpg"  # or a URL, or a PIL Image object
+
+# Prepare training data
+training_data = prepare_training_data(
+    question=question,
+    answer=answer,
+    tokenizer=tokenizer,
+    image_path=image_path,
+    input_size=448,
+    max_num=12
+)
+
+# training_data contains:
+# {
+#     "input_ids": tensor([[...]]),       # token ids for the full conversation
+#     "attention_mask": tensor([[...]]),  # attention mask
+#     "labels": tensor([[...]]),          # labels with -100 on unsupervised positions
+#     "pixel_values": tensor([[...]])     # image tensor
+# }
+"""
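+
+# A minimal sanity check (a sketch, assuming `tokenizer` is loaded as in the
+# example above): decode only the supervised positions and confirm they cover
+# exactly the assistant answer plus the end-of-turn marker. Useful after any
+# change to the chat template above.
+"""
+sample = prepare_training_data(
+    question="What is shown here?",
+    answer="A red square.",
+    tokenizer=tokenizer,
+)
+supervised = sample["input_ids"][0][sample["labels"][0] != -100]
+print(tokenizer.decode(supervised))  # expected: "A red square.<|im_end|>\n"
+"""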
diff --git a/totrain.py b/totrain.py
new file mode 100644
index 0000000..152dd62
--- /dev/null
+++ b/totrain.py
@@ -0,0 +1,44 @@
+from datasets import load_dataset
+import requests
+import pickle
+from PIL import Image
+from io import BytesIO
+import pandas as pd
+
+ds = load_dataset("ckandemir/amazon-products")
+valid_data = []
+processed_count = 0  # number of successfully processed records
+
+# Walk the dataset and validate every record
+for idx, item in enumerate(ds['train']):
+    # Skip records with an empty Product Name
+    if not item['Product Name'] or pd.isna(item['Product Name']):
+        continue
+
+    # Verify that the image URL can actually be loaded
+    try:
+        response = requests.get(item['Image'], timeout=5)
+        if response.status_code == 200:
+            # Try to decode the response into a PIL Image object
+            image = Image.open(BytesIO(response.content))
+            # On success, keep the image object and the product name
+            valid_data.append({
+                'image': image,
+                'name': item['Product Name']
+            })
+
+            processed_count += 1
+            # Log progress every 100 successful records
+            if processed_count % 100 == 0:
+                print(f"Successfully processed {processed_count} records (at source record {idx+1})")
+    except Exception:
+        continue
+
+print("\nProcessing complete!")
+print(f"Original record count: {len(ds['train'])}")
+print(f"Valid records after cleaning: {len(valid_data)}")
+
+# Save the cleaned data
+with open('valid_products.pkl', 'wb') as f:
+    pickle.dump(valid_data, f)
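+
+# A quick way to verify the saved file (a sketch; run in a separate session,
+# following the docstring-example convention used in predata.py):
+"""
+import pickle
+with open('valid_products.pkl', 'rb') as f:
+    valid_data = pickle.load(f)
+print(len(valid_data))
+print(valid_data[0]['name'], valid_data[0]['image'].size)
+"""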
diff --git a/zhtrain.py b/zhtrain.py
new file mode 100644
index 0000000..9f7056f
--- /dev/null
+++ b/zhtrain.py
@@ -0,0 +1,234 @@
+import os
+os.environ['CUDA_VISIBLE_DEVICES'] = '2,3'
+
+import pickle
+from PIL import Image
+import torch
+from torch.utils.data import Dataset, DataLoader
+from predata import prepare_training_data
+from transformers import AutoTokenizer, AutoModel
+from torch.optim import AdamW
+
+# Load the tokenizer
+tokenizer = AutoTokenizer.from_pretrained("Internvl2_5")
+
+# Load the cleaned data produced by totrain.py
+print("Loading data...")
+with open('valid_products.pkl', 'rb') as f:
+    data = pickle.load(f)
+
+print(f"Loaded {len(data)} records")
+
+class ProductDataset(Dataset):
+    def __init__(self, data, tokenizer):
+        self.data = data
+        self.tokenizer = tokenizer
+
+    def __len__(self):
+        return len(self.data)
+
+    def __getitem__(self, idx):
+        item = self.data[idx]
+        question = "Tell me the product name in the picture."
+        answer = "The product name is: " + item['name']
+
+        # prepare_training_data handles the chat template, tokenization,
+        # label masking, and image preprocessing in one call
+        training_data = prepare_training_data(
+            question=question,
+            answer=answer,
+            tokenizer=self.tokenizer,
+            image_path=item['image']  # pass the PIL Image object directly
+        )
+
+        return training_data
+
+# Create the dataset
+dataset = ProductDataset(data, tokenizer)
+
+# Print the first sample as a sanity check
+sample = dataset[0]
+print("\nSample:")
+print(f"input_ids shape: {sample['input_ids'].shape}")
+print(f"pixel_values shape: {sample['pixel_values'].shape}")
+print(f"labels shape: {sample['labels'].shape}")
+
+# Select the device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Load the model
+path = 'Internvl2_5'
+model = AutoModel.from_pretrained(
+    path,
+    torch_dtype=torch.bfloat16,
+    low_cpu_mem_usage=True,
+    use_flash_attn=True,
+    trust_remote_code=True,
+    vision_model=None,
+    language_model=None).train().cuda()
+tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
+
+# Load weights from a previous training run
+print("Loading previously trained weights from model_epoch_5.pth ...")
+model.load_state_dict(torch.load('model_epoch_5.pth'))
+print("Weights loaded")
+
+def prepare_model_for_training(model, cast_trainable_params_to_fp32=True):
+    print("Configuring model parameters...")
+
+    # Freeze the vision encoder
+    for param in model.vision_model.parameters():
+        param.requires_grad_(False)
+
+    # Freeze the language model
+    for param in model.language_model.parameters():
+        param.requires_grad_(False)
+
+    # Train only the projector (mlp1). Without this, every parameter is
+    # frozen and the AdamW constructor below would receive an empty list.
+    # The optional fp32 cast stays disabled so mlp1 keeps the same bf16
+    # dtype as the vision features it receives.
+    for param in model.mlp1.parameters():
+        param.requires_grad_(True)
+        # if cast_trainable_params_to_fp32:
+        #     param.data = param.data.to(torch.float32)
+
+    # Report the number of trainable parameters
+    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+    print(f"Trainable parameters: {trainable_params}")
+
+    return model
+
+# Prepare the model
+model = prepare_model_for_training(model)
+
+# Build the optimizer over the trainable parameters only
+optimizer = AdamW(
+    [p for p in model.parameters() if p.requires_grad],
+    lr=1e-4,  # learning rate
+    betas=(0.9, 0.999),
+    eps=1e-8,
+    weight_decay=0.01
+)
+
+def custom_collate_fn(batch):
+    """
+    Pad variable-length sequences into a single batch.
+    """
+    # Longest sequence in the batch
+    max_len = max([b['input_ids'].size(1) for b in batch])
+
+    batch_size = len(batch)
+
+    # Allocate padded tensors
+    input_ids = torch.zeros((batch_size, max_len), dtype=batch[0]['input_ids'].dtype)
+    attention_mask = torch.zeros((batch_size, max_len), dtype=batch[0]['attention_mask'].dtype)
+    labels = torch.full((batch_size, max_len), -100, dtype=batch[0]['labels'].dtype)  # pad labels with -100 so padding is ignored by the loss
+
+    # Copy each sample into the padded tensors
+    for i, item in enumerate(batch):
+        seq_len = item['input_ids'].size(1)
+        input_ids[i, :seq_len] = item['input_ids'][0, :seq_len]
+        attention_mask[i, :seq_len] = item['attention_mask'][0, :seq_len]
+        labels[i, :seq_len] = item['labels'][0, :seq_len]
+
+    # pixel_values are fixed-size, so a plain stack works
+    pixel_values = torch.stack([item['pixel_values'] for item in batch])
+
+    return {
+        'input_ids': input_ids,
+        'attention_mask': attention_mask,
+        'labels': labels,
+        'pixel_values': pixel_values
+    }
+
+# Build the DataLoader with the custom collate function
+train_loader = DataLoader(
+    dataset,
+    batch_size=5,  # adjust to fit GPU memory
+    shuffle=True,
+    pin_memory=torch.cuda.is_available(),
+    collate_fn=custom_collate_fn
+)
+
+# Resolve the id of the image-context token
+IMG_CONTEXT_TOKEN = '<IMG_CONTEXT>'
+img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
+
+def extract_feature(model, pixel_values):
+    # Drop the extra tile dimension if present
+    if pixel_values.dim() == 5:  # [batch_size, 1, 3, 448, 448]
+        pixel_values = pixel_values.squeeze(1)  # -> [batch_size, 3, 448, 448]
+
+    # Extract features with the vision encoder
+    vit_embeds = model.vision_model(
+        pixel_values=pixel_values,
+        output_hidden_states=False,
+        return_dict=True
+    ).last_hidden_state
+
+    vit_embeds = vit_embeds[:, 1:, :]  # drop the CLS token
+
+    # Reshape to a square grid, pixel-shuffle down, and project with mlp1
+    h = w = int(vit_embeds.shape[1] ** 0.5)
+    vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
+    vit_embeds = model.pixel_shuffle(vit_embeds, scale_factor=model.downsample_ratio)
+    vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
+    vit_embeds = model.mlp1(vit_embeds)
+    return vit_embeds
+
+# Training loop
+num_epochs = 40
+for epoch in range(num_epochs):
+    model.train()
+    total_loss = 0
+    for batch_idx, batch in enumerate(train_loader):
+        # Move the batch to the GPU
+        input_ids = batch['input_ids'].to(device)
+        attention_mask = batch['attention_mask'].to(device)
+        labels = batch['labels'].to(device)
+        pixel_values = batch['pixel_values'].to(device)
+
+        # Extract image features
+        vit_embeds = extract_feature(model, pixel_values)
+
+        # Compute the text input embeddings
+        input_embeds = model.language_model.get_input_embeddings()(input_ids)
+        B, N, C = input_embeds.shape
+        input_embeds = input_embeds.reshape(B * N, C)
+
+        # Splice the image features into the <IMG_CONTEXT> positions
+        input_ids_flat = input_ids.reshape(B * N)
+        selected = (input_ids_flat == img_context_token_id)
+        input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
+
+        # Restore the original shape
+        input_embeds = input_embeds.reshape(B, N, C)
+
+        # Forward pass through the language model only
+        outputs = model.language_model(
+            inputs_embeds=input_embeds,
+            attention_mask=attention_mask,
+            labels=labels,
+            return_dict=True
+        )
+
+        loss = outputs.loss
+        total_loss += loss.item()
+
+        # Backward pass and optimizer step
+        loss.backward()
+        optimizer.step()
+        optimizer.zero_grad()
+
+        # Print the loss every 100 steps
+        if batch_idx % 100 == 0:
+            print(f'Epoch: {epoch}, Batch: {batch_idx}, Loss: {loss.item():.4f}')
+
+    # Average loss at the end of each epoch
+    avg_loss = total_loss / len(train_loader)
+    print(f'Epoch {epoch} completed. Average Loss: {avg_loss:.4f}')
+
+    # Save a checkpoint every 10 epochs
+    if (epoch + 1) % 10 == 0:
+        save_path = f'model_epoch_{epoch+1}.pth'
+        print(f"Saving model weights to {save_path}")
+        torch.save(model.state_dict(), save_path)
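+
+# A quick qualitative check after training (a sketch, not part of the training
+# run; `model.chat` is the InternVL chat helper from the remote code, and the
+# image path is illustrative):
+"""
+from predata import load_image
+
+model.eval()
+pixel_values = load_image('./examples/image1.jpg', max_num=12)[-1:].to(torch.bfloat16).cuda()
+question = '<image>\nTell me the product name in the picture.'
+response = model.chat(tokenizer, pixel_values, question,
+                      dict(max_new_tokens=64, do_sample=False))
+print(response)
+"""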