1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859 |
- from jiexiExcel import write_data_to_file # 导入特定的函数
- from checkBad import beginCheckBadFun # excel文件夹路径
- from formatData import formatAlpaca # excel文件夹路径
- from merge import merge_json_files # 合并所有为一个训练脚本
- import json
- import asyncio
- from collections import OrderedDict
- # 清洗数据函数
def formatDataFun(
    input_directory='/Users/yushanghui/hongshantianping/git/dataTools/book/通用书籍/2024.8.16/',
    output_file="/Users/yushanghui/hongshantianping/git/dataTools/book/json/ty2024.8.16.json",
    out_file='/Users/yushanghui/hongshantianping/git/dataTools/book/jsonOut/ty2024.8.16',
    alpaca_file='/Users/yushanghui/hongshantianping/git/dataTools/book/alpaca/ty2024.8.16.json',
):
    """Run the data-cleaning pipeline for one batch of source files.

    The three stages are executed in order, each consuming the previous
    stage's output:

    1. ``write_data_to_file(input_directory, output_file)``
    2. ``beginCheckBadFun(output_file, out_file)``
    3. ``formatAlpaca(out_file, alpaca_file)``

    All paths are keyword parameters so another batch can be processed
    without editing this function; the defaults are the values that used to
    be hard-coded, so calling ``formatDataFun()`` behaves as before.

    Args:
        input_directory: Folder holding the source Excel files (per the
            original comments; the helper's exact expectations live in
            ``jiexiExcel`` -- confirm there).
        output_file: JSON file the Excel content is written to.
        out_file: Output location for the ``checkBad`` filtering stage
            (described as an output folder in the original comments).
        alpaca_file: Destination path passed to ``formatAlpaca``.
    """
    # Stage 1: convert the Excel files to a JSON file at ``output_file``.
    write_data_to_file(input_directory, output_file)
    # Stage 2: filter the JSON through checkBad into ``out_file``.
    beginCheckBadFun(output_file, out_file)
    # Stage 3: produce the Alpaca-format file from the filtered output.
    formatAlpaca(out_file, alpaca_file)
    # NOTE: the optional de-duplication step, editFilter(alpaca_file), is
    # intentionally not run here (it was disabled in the original script).
def remove_duplicates(input_file):
    """Load a JSON list of records and keep one record per ``instruction``.

    When several records share the same ``instruction``, the one whose
    ``output`` is strictly longest is kept; on a tie the earliest record
    wins. Records are returned in the order their ``instruction`` first
    appeared in the file.

    Args:
        input_file: Path to a UTF-8 JSON file containing a list of mappings
            that each have ``instruction`` and ``output`` keys.

    Returns:
        A list with the surviving records.
    """
    with open(input_file, 'r', encoding='utf-8') as fh:
        records = json.load(fh)

    longest_by_instruction = OrderedDict()
    for record in records:
        key = record['instruction']
        candidate_output = record['output']
        kept = longest_by_instruction.get(key)
        # Store the record when the instruction is new, or when this record's
        # output is strictly longer than the one kept so far.
        if kept is None or len(candidate_output) > len(kept['output']):
            longest_by_instruction[key] = record

    return list(longest_by_instruction.values())
def editFilter(alpaca_file):
    """De-duplicate ``alpaca_file`` and save the result as a new JSON file.

    The records returned by ``remove_duplicates(alpaca_file)`` are written
    to ``./book/alpaca/new_diedaiEdit.json`` (UTF-8, non-ASCII characters
    kept as-is, 4-space indent), and the number of surviving records is
    printed.

    Args:
        alpaca_file: Path of the JSON file to de-duplicate.
    """
    deduped_records = remove_duplicates(alpaca_file)
    record_count = len(deduped_records)
    with open('./book/alpaca/new_diedaiEdit.json', 'w', encoding='utf-8') as sink:
        json.dump(deduped_records, sink, ensure_ascii=False, indent=4)
    print(record_count)
async def mergedFun():
    """Await ``merge_json_files`` for the Alpaca folder.

    The input folder is ``./book/alpaca/`` and the output file is
    ``./book/merged.json``; both are passed straight to
    ``merge_json_files``.
    """
    await merge_json_files(
        './book/alpaca/',      # input folder
        './book/merged.json',  # output file
    )
if __name__ == "__main__":
    # Script entry point: runs the cleaning pipeline.
    # The merge step is disabled here; to run it instead, use:
    #     asyncio.run(mergedFun())
    formatDataFun()
|