"""Trim trailing incomplete sentences from the 'response' field of a pickled batch.

Reads ``batch_1.pkl``, truncates every item's ``'response'`` text after its
last period, writes the result to ``batch_1_cleaned.pkl`` and prints a few
before/after samples.
"""
import pickle

INPUT_PATH = 'batch_1.pkl'
OUTPUT_PATH = 'batch_1_cleaned.pkl'
SAMPLE_COUNT = 3  # number of before/after examples printed at the end


def clean_text(text: str) -> str:
    """Return *text* truncated after its last '.', with outer whitespace stripped.

    Everything following the final period (typically an unfinished sentence)
    is dropped. If *text* contains no period at all it is returned unchanged,
    without stripping.
    """
    last_dot = text.rfind('.')
    if last_dot == -1:
        # No period found: leave the text exactly as it was.
        return text
    # Keep everything up to and including the last period.
    return text[:last_dot + 1].strip()


def main() -> None:
    """Load the batch, clean every response, save the result and print samples."""
    # Load the original data.
    print("正在加载数据...")
    with open(INPUT_PATH, 'rb') as f:
        # SECURITY: pickle.load can execute arbitrary code; only use this on
        # files from a trusted source.
        data = pickle.load(f)

    # Clean the text; each item is shallow-copied so the loaded data is untouched.
    print("正在处理文本...")
    cleaned_data = [
        {**item.copy(), 'response': clean_text(item['response'])}
        for item in data
    ]

    # Save the cleaned data.
    print("正在保存清理后的数据...")
    with open(OUTPUT_PATH, 'wb') as f:
        pickle.dump(cleaned_data, f)

    # Print a few before/after examples; zip stops at the shortest input,
    # so fewer than SAMPLE_COUNT items is handled automatically.
    print("\n处理示例:")
    for i, original, cleaned in zip(range(1, SAMPLE_COUNT + 1), data, cleaned_data):
        print(f"\n原始文本 {i}:")
        print(original['response'])
        print(f"\n处理后文本 {i}:")
        print(cleaned['response'])

    print(f"\n总数据量: {len(data)}")
    print(f"数据已保存到 {OUTPUT_PATH}")


if __name__ == "__main__":
    main()