43 lines
1.2 KiB
Python
43 lines
1.2 KiB
Python
import pickle
|
|
|
|
def clean_text(text: str, terminators=(".",)) -> str:
    """Drop whatever follows the last sentence terminator in *text*.

    The text is cut immediately after the last occurrence of any
    terminator, and the kept part is stripped of leading and trailing
    whitespace. When no terminator occurs, *text* is returned exactly as
    given (it is NOT stripped in that case).

    Args:
        text: The string to trim.
        terminators: Iterable of terminator strings. Defaults to the ASCII
            period only. A plain string such as ".!?" is treated as a set
            of single-character terminators. Empty strings are ignored.

    Returns:
        The trimmed string, or *text* unchanged if no terminator is found.

    Note:
        Any terminator character counts, so a period inside a number or an
        abbreviation (e.g. "3.14") is also treated as a cut point.
    """
    cut = -1
    for mark in terminators:
        if not mark:
            # rfind("") matches at len(text), which would disable trimming.
            continue
        pos = text.rfind(mark)
        if pos != -1:
            # Keep the terminator itself: cut just past its last character.
            cut = max(cut, pos + len(mark))

    if cut == -1:  # no terminator anywhere in the text
        return text

    return text[:cut].strip()
|
|
|
|
# Load the original records from the pickle file.
print("正在加载数据...")
with open('batch_1.pkl', 'rb') as src:
    data = pickle.load(src)

# Build shallow copies first, then overwrite each copy's 'response' with its
# cleaned version so the loaded records themselves are left untouched.
print("正在处理文本...")
cleaned_data = [record.copy() for record in data]
for record, duplicate in zip(data, cleaned_data):
    duplicate['response'] = clean_text(record['response'])

# Persist the cleaned records.
print("正在保存清理后的数据...")
with open('batch_1_cleaned.pkl', 'wb') as dst:
    pickle.dump(cleaned_data, dst)

# Show up to the first three records before and after cleaning.
print("\n处理示例:")
for i, (before, after) in enumerate(zip(data, cleaned_data[:3])):
    print(f"\n原始文本 {i+1}:")
    print(before['response'])
    print(f"\n处理后文本 {i+1}:")
    print(after['response'])

print(f"\n总数据量: {len(data)}")
print("数据已保存到 batch_1_cleaned.pkl")