import os
import json
import random
import asyncio
import aiofiles
import checkBad


async def _write_json(path, data):
    """Write *data* to *path* as indented UTF-8 JSON, creating parent directories."""
    parent = os.path.dirname(path)
    if parent:
        # The configured output folders may not exist on a fresh checkout.
        os.makedirs(parent, exist_ok=True)
    async with aiofiles.open(path, 'w', encoding='utf-8') as f_out:
        await f_out.write(json.dumps(data, ensure_ascii=False, indent=2))


def _membership_test(container):
    """Return a predicate equivalent to ``value in container``.

    When *container* is a list of hashable values the predicate is backed by
    a frozenset so each lookup is O(1) instead of a linear scan. Any case
    that cannot be hashed falls back to the plain ``in`` operator, so the
    result is always the same as the direct membership test.
    """
    if not isinstance(container, list):
        return lambda value: value in container
    try:
        lookup = frozenset(container)
    except TypeError:
        # The list holds unhashable entries (dicts/lists): keep the scan.
        return lambda value: value in container

    def contains(value):
        try:
            return value in lookup
        except TypeError:
            # Unhashable probe value: defer to the original list semantics.
            return value in container

    return contains


async def merge_json_files(input_folder, output_file, output_test_file=None,
                           sample_size=5000):
    """Merge every ``*.json`` file in a folder into one shuffled JSON file.

    Each input file is parsed as JSON and its contents are appended with
    ``list.extend``. The merged list is shuffled and written to
    *output_file*. If *output_test_file* is given, a random sample is also
    written there.

    NOTE: the sample is drawn from the merged data but is not removed from
    it, so the sampled records also appear in *output_file*.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.json`` files.
        output_file: Path of the merged output file.
        output_test_file: Optional path for the sampled subset. Nothing is
            written when it is falsy or when the sample is empty.
        sample_size: Maximum number of records in the sample; all records
            are used when fewer are available.

    Raises:
        json.JSONDecodeError: If an input file is not valid JSON.
        OSError: If a file or directory cannot be read or written.
    """
    all_data = []

    # Collect the contents of every JSON file in the input folder.
    for filename in os.listdir(input_folder):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(input_folder, filename)
        async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
            all_data.extend(json.loads(await f.read()))

    # Randomize the record order before sampling and writing.
    random.shuffle(all_data)

    sample_data = []
    if output_test_file:
        sample_data = random.sample(all_data, min(sample_size, len(all_data)))
    print(f'all_data长度: {len(all_data)} , sample_data长度: {len(sample_data)}')

    await _write_json(output_file, all_data)
    if sample_data:
        await _write_json(output_test_file, sample_data)


async def main():
    """Build the merged data set and its test sample, then merge the test folder."""
    # First pass: merge the alpaca folder and write a sampled test file.
    input_folder1 = './book/alpaca'
    output_file1 = './book/all/merged.json'
    output_test_file1 = './book/all/test/merged_test.json'
    await merge_json_files(input_folder1, output_file1, output_test_file1)

    # Second pass: merge everything in the test folder into a single file.
    input_folder2 = './book/all/test/'
    output_file2 = './book/all/shuiwuTest.json'
    await merge_json_files(input_folder2, output_file2)


async def checkLoop(input_folder, check_file):
    """Split records by whether their ``instruction`` appears in *check_file*.

    *check_file* must contain a single JSON document (not one JSON value per
    line). Every ``*.json`` file in *input_folder* is parsed and each of its
    items is looked up by ``item.get('instruction')``. Files that are not
    valid JSON are reported and skipped.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.json`` files.
        check_file: Path of the JSON file holding the values to match.

    Returns:
        A dict with two lists: ``"all_book"`` holds the items whose
        instruction was not found in *check_file*, ``"loop"`` holds the
        items whose instruction was found.
    """
    async with aiofiles.open(check_file, 'r', encoding='utf-8') as f_in:
        all_wen = json.loads(await f_in.read())
    print(f"修改数据 {len(all_wen)}")

    # Build the lookup once; scanning a list per record is quadratic overall.
    is_listed = _membership_test(all_wen)

    all_data = []
    loop_lines = []
    total_seen = 0

    for filename in os.listdir(input_folder):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(input_folder, filename)
        async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
            raw_text = await f.read()
        try:
            records = json.loads(raw_text)
        except json.JSONDecodeError as err:
            # Best effort: a broken file must not abort the whole run,
            # but it should not vanish silently either.
            print(f"skipping {file_path}: invalid JSON ({err})")
            continue

        for item in records:
            total_seen += 1
            if is_listed(item.get('instruction', None)):
                loop_lines.append(item)
            else:
                all_data.append(item)

    print(f"总数据 {len(all_data)},所有的数据{total_seen}")
    print(f"重复数据 {len(loop_lines)}")

    return {
        "all_book": all_data,
        "loop": loop_lines,
    }


def to_filter_loop():
    """Run checkLoop on the configured paths and pass the result to checkBad.write_files."""
    input_folder = './book/all_book/'
    check_file = './book/过滤/queries.json'
    result_data = asyncio.run(checkLoop(input_folder, check_file))
    checkBad.write_files("./book/filter", result_data, True)


if __name__ == "__main__":
    # Alternate entry point, to rebuild the merged and test files instead:
    # asyncio.run(main())
    to_filter_loop()