"""Helpers for filtering instruction-style JSON datasets.

``extract_queries`` filters records by id, and ``check_loop`` filters records
whose ``instruction`` text is already known.  Running this file as a script
executes the ``check_loop`` pass configured at the bottom.
"""
import json
import os


def extract_queries(a_file: str, b_file: str, output_file: str) -> list:
    """Split the records of ``b_file`` by whether their id is listed in ``a_file``.

    Filters out bad records by id.  Prefer not to rely on this function:
    the ids are not stable between dataset builds.

    Args:
        a_file: Path to a JSON file holding a list of ids to filter out.
        b_file: Path to a JSON file holding a list of objects, each with an
            ``id`` and an ``instruction`` key.
        output_file: Path that receives the ``instruction`` values of the
            filtered-out records, as a JSON list.

    Returns:
        The records of ``b_file`` whose id is NOT listed in ``a_file``.
    """
    # Explicit UTF-8: the data contains non-ASCII text and the platform
    # default encoding is not guaranteed to be UTF-8.
    with open(a_file, 'r', encoding='utf-8') as handle:
        a_ids = set(json.load(handle))

    with open(b_file, 'r', encoding='utf-8') as handle:
        b_objects = json.load(handle)

    print(len(b_objects), len(a_ids))

    queries = []  # instructions of the records whose id is listed
    result = []   # records that are kept
    for obj in b_objects:
        if obj['id'] in a_ids:
            queries.append(obj['instruction'])
        else:
            result.append(obj)

    # Save the extracted instructions to the output file.
    with open(output_file, 'w', encoding="utf-8") as handle:
        json.dump(queries, handle, ensure_ascii=False, indent=4)

    return result


# Example invocation of extract_queries:
# a_file = './book/过滤/all_id.json'
# b_file = './book/all/merged.json'
# output_file = './book/过滤/queries.json'
# result = extract_queries(a_file, b_file, output_file)
# print(len(result))


def check_loop(a_file: str, check_file: str, check_folder: str,
               output_file: str) -> list:
    """Collect records from ``check_folder`` whose instruction is not yet known.

    The set of known instructions is the list stored in ``a_file`` plus the
    ``instruction`` value of every record in ``check_file``.  Every ``*.json``
    file directly inside ``check_folder`` is then scanned, and records whose
    ``instruction`` is not in that set are kept.

    Args:
        a_file: Path to a JSON file holding a list of instruction strings.
        check_file: Path to a JSON file holding a list of objects with an
            ``instruction`` key.  If it is not valid JSON, an error is
            printed and only ``a_file`` is used.
        check_folder: Directory whose ``*.json`` files are scanned.  Files
            that are not valid JSON are reported and skipped.
        output_file: Path that receives the kept records as a JSON list.

    Returns:
        The kept records, in sorted-filename order and then file order.
    """
    result = []

    # Load the known instructions from a_file.
    with open(a_file, 'r', encoding='utf-8') as handle:
        a_asks = set(json.load(handle))
    print(len(a_asks))

    # Add the instructions found in check_file to the known set.
    with open(check_file, 'r', encoding='utf-8') as handle:
        try:
            check_items = json.load(handle)
        except json.JSONDecodeError:
            # Report the file that actually failed; best effort, keep going.
            print(f"Error decoding JSON in file: {check_file}")
            check_items = None
    if check_items is not None:
        print(len(check_items))
        for entry in check_items:
            a_asks.add(entry['instruction'])

    # Walk every JSON file in check_folder; sorted so the output order does
    # not depend on the file system's listing order.
    for filename in sorted(os.listdir(check_folder)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(check_folder, filename)
        with open(file_path, 'r', encoding='utf-8') as f_in:
            try:
                items = json.load(f_in)
            except json.JSONDecodeError:
                print(f"Error decoding JSON in file: {file_path}")
                continue
        print(len(items))
        result.extend(
            item for item in items if item['instruction'] not in a_asks
        )

    with open(output_file, 'w', encoding="utf-8") as handle:
        json.dump(result, handle, ensure_ascii=False, indent=4)
    return result


if __name__ == "__main__":
    # Script entry point: guarded so that importing this module for its
    # functions does not touch the hard-coded dataset paths.
    a_file = './book/过滤/queries.json'
    b_file = './book/alpaca/diedaiEdit.json'
    check_folder = './book/all_book/'
    output_file = './book/alpaca/book_filter.json'
    result = check_loop(a_file, b_file, check_folder, output_file)
    print(len(result))