filter_id.py 2.7 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788
  1. import json
  2. import os
# Filter out bad records by id. Avoid using this where possible, because ids are not stable.
  4. def extract_queries(a_file, b_file, output_file):
  5. # 读取a.json
  6. with open(a_file, 'r') as file:
  7. a_ids = set(json.load(file))
  8. # 读取b.json
  9. with open(b_file, 'r') as file:
  10. b_objects = json.load(file)
  11. print(len(b_objects),len(a_ids))
  12. queries = []
  13. # 提取query
  14. # queries = [obj['instruction'] for obj in b_objects if obj['id'] in a_ids]
  15. result = []
  16. for obj in b_objects:
  17. if obj['id'] in a_ids:
  18. queries.append(obj['instruction'])
  19. continue
  20. result.append(obj)
  21. # 保存结果到新文件
  22. with open(output_file, 'w', encoding="utf-8") as file:
  23. json.dump(queries, file, ensure_ascii=False, indent=4)
  24. return result
# Example invocation:
  26. # a_file = './book/过滤/all_id.json'
  27. # b_file = './book/all/merged.json'
  28. # output_file = './book/过滤/queries.json'
  29. # result = extract_queries(a_file, b_file, output_file)
  30. # print(len(result))
  31. def check_loop(a_file, check_file, check_folder,output_file):
  32. result = []
  33. loop = []
  34. # 读取a_file中的JSON数据
  35. with open(a_file, 'r', encoding='utf-8') as file:
  36. a_asks = set(json.load(file))
  37. print(len(a_asks))
  38. # # 读取check_file中的数据
  39. with open(check_file, 'r', encoding='utf-8') as file:
  40. try:
  41. file = json.load(file)
  42. print(len(file))
  43. for line in file:
  44. a_asks.add(line['instruction'])
  45. except json.JSONDecodeError:
  46. print(f"Error decoding JSON in file: {file_path}")
  47. # 遍历check_folder中的所有JSON文件
  48. for filename in os.listdir(check_folder):
  49. if filename.endswith('.json'):
  50. file_path = os.path.join(check_folder, filename)
  51. with open(file_path, 'r', encoding='utf-8') as f_in:
  52. try:
  53. items = json.load(f_in)
  54. print(len(items))
  55. for item in items:
  56. if item['instruction'] in a_asks :
  57. loop.append(item)
  58. else:
  59. result.append(item)
  60. except json.JSONDecodeError:
  61. print(f"Error decoding JSON in file: {file_path}")
  62. with open(output_file, 'w', encoding="utf-8") as file:
  63. json.dump(result, file, ensure_ascii=False, indent=4)
  64. return result
  65. a_file = './book/过滤/queries.json'
  66. b_file = './book/alpaca/diedaiEdit.json'
  67. check_folder = './book/all_book/'
  68. output_file = './book/alpaca/book_filter.json'
  69. result = check_loop(a_file, b_file, check_folder,output_file)
  70. print(len(result))