# init.py (2.4 KB)
  1. from jiexiExcel import write_data_to_file # 导入特定的函数
  2. from checkBad import beginCheckBadFun # excel文件夹路径
  3. from formatData import formatAlpaca # excel文件夹路径
  4. from merge import merge_json_files # 合并所有为一个训练脚本
  5. import json
  6. import asyncio
  7. from collections import OrderedDict
  8. # 清洗数据函数
  9. def formatDataFun():
  10. input_directory = '/Users/yushanghui/hongshantianping/git/dataTools/book/通用书籍/2024.8.16/'
  11. # 把excel文件转成json文件 并保存到指定路径
  12. output_file = "/Users/yushanghui/hongshantianping/git/dataTools/book/json/ty2024.8.16.json"
  13. # 通过checkBad.py 函数进行过滤指定输出文件夹
  14. out_file = '/Users/yushanghui/hongshantianping/git/dataTools/book/jsonOut/ty2024.8.16'
  15. alpaca_file = '/Users/yushanghui/hongshantianping/git/dataTools/book/alpaca/ty2024.8.16.json'
  16. write_data_to_file(input_directory, output_file)
  17. beginCheckBadFun(output_file, out_file)
  18. formatAlpaca(out_file, alpaca_file)
  19. # editFilter(alpaca_file)
  20. def remove_duplicates(input_file):
  21. with open(input_file, 'r', encoding='utf-8') as file:
  22. data = json.load(file)
  23. unique_data = OrderedDict()
  24. for entry in data:
  25. instruction = entry['instruction']
  26. output = entry['output']
  27. # 如果已经存在相同的instruction,比较时间,保留最新的
  28. if instruction in unique_data:
  29. existing_output = unique_data[instruction]['output']
  30. if len(output) > len(existing_output):
  31. unique_data[instruction] = entry
  32. else:
  33. unique_data[instruction] = entry
  34. # 返回去重后的数据
  35. return list(unique_data.values())
  36. def editFilter(alpaca_file):
  37. unique_data = remove_duplicates(alpaca_file)
  38. with open('./book/alpaca/new_diedaiEdit.json', 'w', encoding='utf-8') as file:
  39. json.dump(unique_data, file, ensure_ascii=False, indent=4)
  40. print(len(unique_data))
  41. async def mergedFun():
  42. input_folder1 = './book/alpaca/' # 输入文件夹路径
  43. output_file1 = './book/merged.json' # 输出文件名
  44. # 第一次执行 merge_json_files
  45. await merge_json_files(input_folder1, output_file1)
  46. if __name__ == "__main__":
  47. # asyncio.run(mergedFun())
  48. formatDataFun()