12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788 |
- import json
- import os
def extract_queries(a_file, b_file, output_file):
    """Split the objects in ``b_file`` by whether their id is listed in ``a_file``.

    Filters out bad records by id. Prefer not to use this helper, because the
    ids are not fixed.

    Args:
        a_file: Path to a JSON file containing a list of ids to filter out.
        b_file: Path to a JSON file containing a list of objects. Every object
            is looked up by its ``id`` key; filtered-out objects must also
            have an ``instruction`` key.
        output_file: Path that receives, as a JSON list, the ``instruction``
            of every filtered-out object.

    Returns:
        The objects from ``b_file`` whose ``id`` is not listed in ``a_file``,
        in their original order.
    """
    # Read explicitly as UTF-8: the output is written as UTF-8, and relying on
    # the platform default encoding breaks on non-ASCII text on some systems.
    with open(a_file, 'r', encoding='utf-8') as file:
        a_ids = set(json.load(file))

    with open(b_file, 'r', encoding='utf-8') as file:
        b_objects = json.load(file)
    print(len(b_objects), len(a_ids))

    queries = []
    result = []
    for obj in b_objects:
        if obj['id'] in a_ids:
            queries.append(obj['instruction'])
        else:
            result.append(obj)

    # Persist the instructions of the filtered-out objects.
    with open(output_file, 'w', encoding="utf-8") as file:
        json.dump(queries, file, ensure_ascii=False, indent=4)

    return result
- # Example invocation of extract_queries (currently disabled):
- # a_file = './book/过滤/all_id.json'
- # b_file = './book/all/merged.json'
- # output_file = './book/过滤/queries.json'
- # result = extract_queries(a_file, b_file, output_file)
- # print(len(result))
def check_loop(a_file, check_file, check_folder, output_file):
    """Collect items from ``check_folder`` whose instruction is not already known.

    The set of known instructions is the JSON list stored in ``a_file`` plus
    the ``instruction`` of every record in ``check_file``. Each ``*.json`` file
    directly inside ``check_folder`` is then read; items whose ``instruction``
    is not in that set are kept. Items are not de-duplicated against each
    other, only against the known set.

    Args:
        a_file: Path to a JSON file containing a list of known instructions.
        check_file: Path to a JSON file containing a list of objects with an
            ``instruction`` key. If it is not valid JSON, a message is printed
            and only ``a_file`` contributes to the known set.
        check_folder: Directory whose ``.json`` files are scanned. Files that
            are not valid JSON are reported and skipped.
        output_file: Path that receives the kept items as a JSON list.

    Returns:
        The list of kept items, ordered by file name and then by position
        within each file.
    """
    with open(a_file, 'r', encoding='utf-8') as file:
        a_asks = set(json.load(file))
    print(len(a_asks))

    with open(check_file, 'r', encoding='utf-8') as file:
        try:
            records = json.load(file)
        except json.JSONDecodeError:
            # Report the file that actually failed; no folder path exists yet.
            print(f"Error decoding JSON in file: {check_file}")
        else:
            print(len(records))
            a_asks.update(record['instruction'] for record in records)

    result = []
    # os.listdir returns names in arbitrary order; sort so the output is
    # reproducible from run to run.
    for filename in sorted(os.listdir(check_folder)):
        if not filename.endswith('.json'):
            continue
        file_path = os.path.join(check_folder, filename)
        with open(file_path, 'r', encoding='utf-8') as f_in:
            try:
                items = json.load(f_in)
            except json.JSONDecodeError:
                print(f"Error decoding JSON in file: {file_path}")
                continue
        print(len(items))
        result.extend(
            item for item in items if item['instruction'] not in a_asks
        )

    with open(output_file, 'w', encoding="utf-8") as file:
        json.dump(result, file, ensure_ascii=False, indent=4)

    return result
# Script section: run check_loop over the configured paths and print how many
# items it returned.
a_file = './book/过滤/queries.json'
b_file = './book/alpaca/diedaiEdit.json'
check_folder = './book/all_book/'
output_file = './book/alpaca/book_filter.json'
result = check_loop(
    a_file=a_file,
    check_file=b_file,
    check_folder=check_folder,
    output_file=output_file,
)
print(len(result))
|