@@ -0,0 +1,331 @@
+#查找当前文件所有json文件 输出到一个文件夹每个会话对象换行不在一个集合里面
+# import json
+# import os
+# def convert_json_files(input_folder):
+# all_conversations = []
+# for root, dirs, files in os.walk(input_folder):
+# for file_name in files:
+# if file_name.endswith('.json'):
+# input_file = os.path.join(root, file_name)
+# with open(input_file, 'r', encoding='utf-8') as f_in:
+# for line in f_in:
+# if not line.strip():
+# continue
+# try:
+# conversations = json.loads(line)
+# conversation_obj = {"conversations": []}
+# for i in range(0, len(conversations), 2):
+# if i == 0:
+# conversation_obj["conversations"].append({"role": "system", "content": "你是污水处理厂技术专家,针对用户提出的问题,提供专业见解和建议并且清晰有条理,解决用户提出的问题"})
+# conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问:")})
+# conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答:")})
+# all_conversations.append(conversation_obj)
+# except json.JSONDecodeError:
+# print("Error: Invalid JSON format in file:", input_file)
+# continue
+# return all_conversations
+# #指定输入文件夹
+# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/optimizationData'
+# all_conversations = convert_json_files(input_folder)
+# # # 指定输出文件
+# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/2024.4.09.json'
+# # 将所有对话写入输出文件
+# with open(output_file, 'w', encoding='utf-8') as f_out:
+# for conversation_obj in all_conversations:
+# json.dump(conversation_obj, f_out, ensure_ascii=False)
+# f_out.write('\n')
+# import json
+# import os
+# def convert_json_files(input_folder):
+# all_conversations = []
+# special_lines = []
+# id = 0
+# for root, dirs, files in os.walk(input_folder):
+# for file_name in files:
+# if file_name.endswith('.json'):
+# input_file = os.path.join(root, file_name)
+# with open(input_file, 'r', encoding='utf-8') as f_in:
+# for line in f_in:
+# if "如图所示:" in line:
+# special_lines.append(line)
+# continue
+# if not line.strip():
+# continue
+# try:
+# conversations = json.loads(line)
+# id += 1
+# conversation_obj = {"id":"identity_" + str(id),"conversations": []}
+# for i in range(0, len(conversations), 2):
+# # if i == 0:
+# # conversation_obj["conversations"].append({"role": "system","content": ""})
+# conversation_obj["conversations"].append(
+# {"from": "user", "value": conversations[i].lstrip("问:")})
+# conversation_obj["conversations"].append(
+# {"from": "assistant", "value": conversations[i + 1].lstrip("答:")})
+# all_conversations.append(conversation_obj)
+# except json.JSONDecodeError:
+# print("Error: Invalid JSON format in file:", input_file)
+# continue
+# # Write special lines to a separate JSON file
+# if special_lines:
+# special_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/special_file.json' # Define the path for the special file
+# with open(special_file, 'w', encoding='utf-8') as special_out:
+# for line in special_lines:
+# special_out.write(line)
+# return all_conversations
+# # 指定输入文件夹
+# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/optimizationData'
+# all_conversations = convert_json_files(input_folder)
+# # 指定输出文件
+# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/hongshanData_water.json'
+# # 将所有对话写入输出文件
+# with open(output_file, 'w', encoding='utf-8') as f_out:
+# json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)
+# 输出到一个集合里面 并且格式化
+# import json
+# import os
+# def convert_json_files(input_folder):
+# all_conversations = []
+# for file_name in os.listdir(input_folder):
+# if file_name.endswith('.json'):
+# input_file = os.path.join(input_folder, file_name)
+# with open(input_file, 'r', encoding='utf-8') as f_in:
+# for line in f_in:
+# try:
+# conversations = json.loads(line)
+# conversation_obj = {"conversations": []}
+# for i in range(0, len(conversations), 2):
+# if i == 0:
+# conversation_obj["conversations"].append({"role": "system", "content": "假设你是一个污水处理厂技术专家,你要回答用户询问的问题."})
+# conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问:")})
+# conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答:")})
+# all_conversations.append(conversation_obj)
+# except json.JSONDecodeError:
+# print("Error: Invalid JSON format in file:", input_file)
+# continue
+# return all_conversations
+# # 指定输入文件夹
+# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/jsondata/hebing'
+# all_conversations = convert_json_files(input_folder)
+# # 指定输出文件
+# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/output.json'
+# # 将所有对话写入输出文件并格式化
+# with open(output_file, 'w', encoding='utf-8') as f_out:
+# json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)
+# 输出的内容是数组并且每个对象会添加,以后换行
+# import json
+# import os
+# def convert_json_files(input_folder):
+# all_conversations = []
+# for file_name in os.listdir(input_folder):
+# if file_name.endswith('.json'):
+# input_file = os.path.join(input_folder, file_name)
+# with open(input_file, 'r', encoding='utf-8') as f_in:
+# for line in f_in:
+# try:
+# conversations = json.loads(line)
+# conversation_obj = {"conversations": []}
+# for i in range(0, len(conversations), 2):
+# if i == 0:
+# conversation_obj["conversations"].append({"role": "system", "content": "假设你是一个污水处理厂技术专家,你要回答用户询问的问题."})
+# conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问:")})
+# conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答:")})
+# all_conversations.append(conversation_obj)
+# except json.JSONDecodeError:
+# print("Error: Invalid JSON format in file:", input_file)
+# continue
+# return all_conversations
+# # 指定输入文件夹
+# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/jsondata/hebing'
+# all_conversations = convert_json_files(input_folder)
+# # 指定输出文件
+# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/output.json'
+# # 将所有对话写入输出文件并格式化
+# with open(output_file, 'w', encoding='utf-8') as f_out:
+# f_out.write("[\n") # 开始列表
+# for i, conversation_obj in enumerate(all_conversations):
+# json.dump(conversation_obj, f_out, ensure_ascii=False)
+# # f_out.write('\n') # 每个对话对象后添加换行符
+# if i < len(all_conversations) - 1:
+# f_out.write(",\n") # 在除了最后一个对话对象之后添加逗号和换行符
+# f_out.write("]\n") # 结束列表
+# qwen训练样本
+# import json
+# import os
+# def convert_json_files(input_folder):
+# all_conversations = []
+# special_lines = []
+# for root, dirs, files in os.walk(input_folder):
+# for file_name in files:
+# if file_name.endswith('.json'):
+# input_file = os.path.join(root, file_name)
+# with open(input_file, 'r', encoding='utf-8') as f_in:
+# for line in f_in:
+# if "如图所示:" in line:
+# special_lines.append(line)
+# continue
+# if not line.strip():
+# continue
+# try:
+# conversations = json.loads(line)
+# conversation_obj = {"conversations": []}
+# print(len(len(conversations)))
+# for i in range(0, len(conversations), 2):
+# # if i == 0:
+# # conversation_obj["conversations"].append({"role": "system","content": ""})
+# conversation_obj["conversations"].append(
+# {"role": "user", "content": conversations[i].lstrip("问:")})
+# conversation_obj["conversations"].append(
+# {"role": "assistant", "content": conversations[i + 1].lstrip("答:")})
+# all_conversations.append(conversation_obj)
+# except json.JSONDecodeError:
+# print("Error: Invalid JSON format in file:", input_file)
+# continue
+# # Write special lines to a separate JSON file
+# if special_lines:
+# special_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/special_file.json' # Define the path for the special file
+# with open(special_file, 'w', encoding='utf-8') as special_out:
+# for line in special_lines:
+# special_out.write(line)
+# return all_conversations
+# # 指定输入文件夹
+# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/optimizationData'
+# all_conversations = convert_json_files(input_folder)
+# # 指定输出文件
+# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/hongshanData.json'
+# # 将所有对话写入输出文件
+# # with open(output_file, 'w', encoding='utf-8') as f_out:
+# # json.dump(all_conversation, f_out, ensure_ascii=False, indent=4)
+# with open(output_file, 'w', encoding='utf-8') as f_out:
+# for conversation_obj in all_conversations:
+# json.dump(conversation_obj, f_out, ensure_ascii=False)
+# f_out.write('\n')
+# qwen训练样本
+import json
+import os
+import random
# Default destination for raw lines that reference a figure ("如图所示:").
_DEFAULT_SPECIAL_FILE = (
    '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/special_file.json'
)


def _strip_prefix(text, prefix):
    """Return ``text`` without one leading occurrence of ``prefix``."""
    # str.lstrip(chars) removes any run of the given characters, so
    # "问:问题" would lose its first real character; slice the prefix instead.
    return text[len(prefix):] if text.startswith(prefix) else text


def _build_record(turns):
    """Build one training record from an alternating question/answer list.

    Returns ``None`` when ``turns`` is not a non-empty, even-length list of
    strings, so the caller can report and skip the line.
    """
    if not isinstance(turns, list) or not turns or len(turns) % 2:
        return None
    if not all(isinstance(turn, str) for turn in turns):
        return None
    pairs = [
        [_strip_prefix(turns[i], "问:"), _strip_prefix(turns[i + 1], "答:")]
        for i in range(0, len(turns), 2)
    ]
    # The final pair is the sample itself; every earlier pair is history.
    instruction, output = pairs[-1]
    return {
        "instruction": instruction,
        "input": "",
        "output": output,
        "history": pairs[:-1],
    }


def convert_json_files(input_folder: str,
                       special_file: str = _DEFAULT_SPECIAL_FILE) -> list:
    """Convert line-delimited Q/A JSON files into instruction-style records.

    Walks ``input_folder`` recursively and reads every file whose name ends
    with ``.json`` line by line. Each non-blank line is parsed as a JSON
    array of strings that alternate question ("问:" prefix) and answer
    ("答:" prefix). The last pair becomes ``instruction``/``output`` and all
    preceding pairs are collected, in order, as ``[question, answer]`` lists
    under ``history``.

    Lines containing "如图所示:" are not converted; they are collected
    verbatim and written to ``special_file`` (only when at least one exists).
    Lines that are not valid JSON, or that are not a non-empty even-length
    list of strings, are reported on stdout and skipped.

    Args:
        input_folder: Root directory searched recursively for ``.json`` files.
        special_file: Path that receives the skipped "如图所示:" lines.

    Returns:
        A list of dicts with the keys ``instruction``, ``input`` (always an
        empty string), ``output`` and ``history``.
    """
    all_conversations = []
    special_lines = []
    for root, dirs, files in os.walk(input_folder):
        # Sorting makes traversal (and therefore output order) reproducible;
        # os.walk otherwise yields entries in filesystem order.
        dirs.sort()
        for file_name in sorted(files):
            if not file_name.endswith('.json'):
                continue
            input_file = os.path.join(root, file_name)
            with open(input_file, 'r', encoding='utf-8') as f_in:
                for line in f_in:
                    if "如图所示:" in line:
                        special_lines.append(line)
                        continue
                    if not line.strip():
                        continue
                    try:
                        conversations = json.loads(line)
                    except json.JSONDecodeError:
                        print("Error: Invalid JSON format in file:", input_file)
                        continue
                    record = _build_record(conversations)
                    if record is None:
                        # Valid JSON but not usable as question/answer pairs.
                        print("Error: Expected a non-empty, even-length list "
                              "of strings in file:", input_file)
                        continue
                    all_conversations.append(record)
    # Write special lines to a separate file, one per line.
    if special_lines:
        with open(special_file, 'w', encoding='utf-8') as special_out:
            for line in special_lines:
                # The last line of a file may lack a newline; add one so the
                # next special line is not glued onto it.
                special_out.write(line if line.endswith('\n') else line + '\n')
    return all_conversations
# Input folder: scanned recursively for *.json conversation files.
input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/孙浩prompt'
all_conversations = convert_json_files(input_folder)
# Output file: receives every converted record as one indented JSON array.
output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/merged.json'
# NOTE: records are written in the order returned by convert_json_files;
# nothing in this script shuffles them.
with open(output_file, mode='w', encoding='utf-8') as f_out:
    f_out.write(json.dumps(all_conversations, ensure_ascii=False, indent=4))
+# with open(output_file, 'w', encoding='utf-8') as f_out:
+# for conversation_obj in all_conversations:
+# json.dump(conversation_obj, f_out, ensure_ascii=False)
+# f_out.write('\n')