# merge.py
  1. import os
  2. import json
  3. import random
  4. import asyncio
  5. import aiofiles
  6. import checkBad
  7. async def merge_json_files(input_folder, output_file, output_test_file=None):
  8. all_data = []
  9. # 遍历输入文件夹中的所有文件
  10. for filename in os.listdir(input_folder):
  11. if filename.endswith('.json'):
  12. file_path = os.path.join(input_folder, filename)
  13. async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
  14. data = json.loads(await f.read())
  15. all_data.extend(data)
  16. # 随机化数据顺序
  17. random.shuffle(all_data)
  18. sample_data = []
  19. if output_test_file:
  20. # 抽取5000条数据或数据量不足时抽取所有数据
  21. sample_data = random.sample(all_data, min(5000, len(all_data)))
  22. print(f'all_data长度: {len(all_data)} , sample_data长度: {len(sample_data)}')
  23. # 将所有数据写入到输出文件中
  24. async with aiofiles.open(output_file, 'w', encoding='utf-8') as f_out:
  25. await f_out.write(json.dumps(all_data, ensure_ascii=False, indent=2))
  26. if sample_data:
  27. async with aiofiles.open(output_test_file, 'w', encoding='utf-8') as f_out:
  28. await f_out.write(json.dumps(sample_data, ensure_ascii=False, indent=2))
  29. # 生成测试集
  30. async def main():
  31. # 设置输入文件夹和输出文件名
  32. input_folder1 = './book/alpaca' # 输入文件夹路径
  33. output_file1 = './book/all/merged.json' # 输出文件名
  34. output_test_file1 = './book/all/test/merged_test.json' # 测试输出文件名
  35. # 第一次执行 merge_json_files
  36. await merge_json_files(input_folder1, output_file1, output_test_file1)
  37. # 第二次执行的输入文件夹和输出文件名
  38. input_folder2 = './book/all/test/' # 输入文件夹路径
  39. output_file2 = './book/all/shuiwuTest.json' # 输出文件名
  40. # 第二次执行 merge_json_files
  41. await merge_json_files(input_folder2, output_file2)
  42. # 通过迭代修改的样本去除旧的错误样本
  43. async def checkLoop(input_folder, check_file):
  44. all_data = []
  45. loop_lines = []
  46. all_wen = set()
  47. testall_wen = []
  48. # 读取 check_file 并解析 JSON 字符串
  49. async with aiofiles.open(check_file, 'r', encoding='utf-8') as f_in:
  50. # async for item in f_in:
  51. # try:
  52. # json_obj = json.loads(item)
  53. # except json.JSONDecodeError:
  54. # continue
  55. #
  56. # first_element = json_obj[0] if json_obj else None
  57. # if first_element:
  58. # all_wen.add(first_element)
  59. # check数据文件如果是一个json集合
  60. # 读取文件内容
  61. file_content = await f_in.read()
  62. # 将读取的字符串内容转换为JSON对象
  63. all_wen = json.loads(file_content)
  64. print(f"修改数据 {len(all_wen)}")
  65. # 遍历输入文件夹中的所有 JSON 文件
  66. for filename in os.listdir(input_folder):
  67. if filename.endswith('.json'):
  68. file_path = os.path.join(input_folder, filename)
  69. async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
  70. try:
  71. dataList = json.loads(await f.read())
  72. except json.JSONDecodeError:
  73. continue
  74. for item in dataList:
  75. testall_wen.append(item)
  76. instruction = item.get('instruction', None)
  77. if instruction in all_wen:
  78. loop_lines.append(item)
  79. continue
  80. all_data.append(item)
  81. print(f"总数据 {len(all_data)},所有的数据{len(testall_wen)}")
  82. print(f"重复数据 {len(loop_lines)}")
  83. result = {
  84. "all_book": all_data,
  85. "loop": loop_lines,
  86. }
  87. return result
  88. #执行过滤重复函数
  89. def to_filter_loop():
  90. input_folder = './book/all_book/'
  91. check_file = './book/过滤/queries.json'
  92. result_data = asyncio.run(checkLoop(input_folder,check_file))
  93. checkBad.write_files("./book/filter", result_data,True)
  94. if __name__ == "__main__":
  95. # 运行主函数
  96. # asyncio.run(main())
  97. to_filter_loop()