Fortrain/clean_text.py

43 lines
1.2 KiB
Python
Raw Permalink Normal View History

import pickle
def clean_text(text: str) -> str:
    """Drop whatever follows the last period in *text*.

    The text is cut immediately after its final ``'.'`` and the kept part
    is stripped of leading and trailing whitespace.

    Args:
        text: The string to trim.

    Returns:
        The portion of *text* up to and including its last ``'.'``, with
        surrounding whitespace removed. When *text* contains no ``'.'`` it
        is returned exactly as given (no stripping is applied).
    """
    # rfind locates the last period directly, so no reversed copies of the
    # string need to be built.
    last_dot = text.rfind('.')
    if last_dot == -1:
        # No period anywhere: hand the input back untouched.
        return text
    return text[:last_dot + 1].strip()
# --- Load the raw records ---------------------------------------------------
# NOTE: pickle.load can execute arbitrary code while unpickling; only run this
# against a batch file that comes from a trusted source.
print("正在加载数据...")
with open('batch_1.pkl', 'rb') as source_file:
    data = pickle.load(source_file)


def _with_clean_response(record):
    """Return a copy of *record* whose 'response' went through clean_text."""
    updated = record.copy()  # keep the loaded record itself unmodified
    updated['response'] = clean_text(record['response'])
    return updated


# --- Clean the 'response' field of every record -----------------------------
print("正在处理文本...")
cleaned_data = [_with_clean_response(record) for record in data]

# --- Persist the cleaned records --------------------------------------------
print("正在保存清理后的数据...")
with open('batch_1_cleaned.pkl', 'wb') as target_file:
    pickle.dump(cleaned_data, target_file)

# --- Show up to three before/after samples ----------------------------------
print("\n处理示例:")
# range(1, 4) caps the preview at three entries; zip stops earlier when the
# data set holds fewer records.
for number, before, after in zip(range(1, 4), data, cleaned_data):
    print(f"\n原始文本 {number}:")
    print(before['response'])
    print(f"\n处理后文本 {number}:")
    print(after['response'])

record_count = len(data)
print(f"\n总数据量: {record_count}")
print("数据已保存到 batch_1_cleaned.pkl")