from jiexiExcel import write_data_to_file   # convert the Excel files to JSON
from checkBad import beginCheckBadFun       # filter out bad records
from formatData import formatAlpaca         # convert filtered JSON to Alpaca format
from merge import merge_json_files          # merge everything into a single training file
import json
import asyncio
from collections import OrderedDict


# Data-cleaning pipeline: Excel -> JSON -> filtered JSON -> Alpaca format
def formatDataFun():
    # Directory containing the source Excel files
    input_directory = '/Users/yushanghui/hongshantianping/git/dataTools/book/通用书籍/2024.8.16/'
    # JSON file produced from the Excel files, saved to this path
    output_file = "/Users/yushanghui/hongshantianping/git/dataTools/book/json/ty2024.8.16.json"
    # Output path for records filtered by checkBad.py
    out_file = '/Users/yushanghui/hongshantianping/git/dataTools/book/jsonOut/ty2024.8.16'
    # Final Alpaca-format training file
    alpaca_file = '/Users/yushanghui/hongshantianping/git/dataTools/book/alpaca/ty2024.8.16.json'

    write_data_to_file(input_directory, output_file)   # Excel -> JSON
    beginCheckBadFun(output_file, out_file)            # filter bad records
    formatAlpaca(out_file, alpaca_file)                # JSON -> Alpaca format
    # editFilter(alpaca_file)


def remove_duplicates(input_file):
    with open(input_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    unique_data = OrderedDict()
    for entry in data:
        instruction = entry['instruction']
        output = entry['output']
        # If the same instruction already exists, keep the entry with the longer output
        if instruction in unique_data:
            existing_output = unique_data[instruction]['output']
            if len(output) > len(existing_output):
                unique_data[instruction] = entry
        else:
            unique_data[instruction] = entry

    # Return the de-duplicated data
    return list(unique_data.values())


def editFilter(alpaca_file):
    unique_data = remove_duplicates(alpaca_file)
    with open('./book/alpaca/new_diedaiEdit.json', 'w', encoding='utf-8') as file:
        json.dump(unique_data, file, ensure_ascii=False, indent=4)
    print(len(unique_data))


async def mergedFun():
    input_folder1 = './book/alpaca/'      # input folder path
    output_file1 = './book/merged.json'   # output file name
    # Merge all Alpaca files into one training file
    await merge_json_files(input_folder1, output_file1)


if __name__ == "__main__":
    # asyncio.run(mergedFun())
    formatDataFun()