123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121 |
- import os
- import json
- import random
- import asyncio
- import aiofiles
- import checkBad
async def merge_json_files(input_folder, output_file, output_test_file=None):
    """Merge every .json file under *input_folder* into one shuffled list.

    The merged list is written to *output_file*. When *output_test_file*
    is given, a random sample of up to 5000 records is also written there
    as a held-out test set.
    """
    merged = []
    # Collect the contents of every JSON file in the input folder.
    for name in os.listdir(input_folder):
        if not name.endswith('.json'):
            continue
        path = os.path.join(input_folder, name)
        async with aiofiles.open(path, 'r', encoding='utf-8') as fh:
            merged.extend(json.loads(await fh.read()))

    # Randomize the record order before writing anything out.
    random.shuffle(merged)

    sample = []
    if output_test_file:
        # Draw 5000 records, or all of them when fewer are available.
        sample = random.sample(merged, min(5000, len(merged)))
        print(f'all_data长度: {len(merged)} , sample_data长度: {len(sample)}')

    # Persist the full merged dataset.
    async with aiofiles.open(output_file, 'w', encoding='utf-8') as out:
        await out.write(json.dumps(merged, ensure_ascii=False, indent=2))

    # Persist the test sample only when one was drawn.
    if sample:
        async with aiofiles.open(output_test_file, 'w', encoding='utf-8') as out:
            await out.write(json.dumps(sample, ensure_ascii=False, indent=2))
# Generate the merged dataset and its test set.
async def main():
    """Run two merge passes: alpaca books → merged.json, then the test folder → shuiwuTest.json."""
    # Pass 1: merge the alpaca book files and carve out a test sample.
    src_folder = './book/alpaca'           # input folder
    merged_path = './book/all/merged.json' # merged output file
    test_path = './book/all/test/merged_test.json'  # test-set output file
    await merge_json_files(src_folder, merged_path, test_path)

    # Pass 2: merge the test folder into a single file (no sampling).
    await merge_json_files('./book/all/test/', './book/all/shuiwuTest.json')
# Remove stale bad samples using the iteratively revised sample set.
async def checkLoop(input_folder, check_file):
    """Partition records under *input_folder* using the revision list in *check_file*.

    *check_file* is expected to hold a single JSON array of instruction
    strings that were revised. Every record in the folder's JSON files
    whose 'instruction' field appears in that list goes into the 'loop'
    bucket (to be discarded); everything else goes into 'all_book'.

    Returns a dict: {"all_book": kept records, "loop": filtered records}.
    """
    all_data = []      # records to keep
    loop_lines = []    # records matched by the revision list
    testall_wen = []   # every record seen, for the summary print

    # Read check_file and parse it as one JSON array.
    async with aiofiles.open(check_file, 'r', encoding='utf-8') as f_in:
        file_content = await f_in.read()
        all_wen = json.loads(file_content)

    print(f"修改数据 {len(all_wen)}")

    # Build the lookup set once: membership tests below become O(1)
    # instead of scanning the whole revision list for every record.
    revised = set(all_wen)

    # Walk every JSON file in the input folder.
    for filename in os.listdir(input_folder):
        if filename.endswith('.json'):
            file_path = os.path.join(input_folder, filename)
            async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
                try:
                    dataList = json.loads(await f.read())
                except json.JSONDecodeError:
                    # Skip files that are not valid JSON.
                    continue
            for item in dataList:
                testall_wen.append(item)
                if item.get('instruction', None) in revised:
                    loop_lines.append(item)
                else:
                    all_data.append(item)

    print(f"总数据 {len(all_data)},所有的数据{len(testall_wen)}")
    print(f"重复数据 {len(loop_lines)}")
    result = {
        "all_book": all_data,
        "loop": loop_lines,
    }
    return result
# Run the duplicate-filtering step.
def to_filter_loop():
    """Filter previously revised samples out of ./book/all_book/ and write the result."""
    source_dir = './book/all_book/'
    revised_file = './book/过滤/queries.json'
    buckets = asyncio.run(checkLoop(source_dir, revised_file))
    checkBad.write_files("./book/filter", buckets, True)
if __name__ == "__main__":
    # asyncio.run(main())  # merge/shuffle pass (currently disabled)
    to_filter_loop()
-
-
|