|
@@ -0,0 +1,331 @@
|
|
|
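+# Find every .json file under the input folder and write all conversations to a single output file: one conversation object per line (JSON Lines), not wrapped in one array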
|
|
|
+# import json
|
|
|
+# import os
|
|
|
+#
|
|
|
+# def convert_json_files(input_folder):
|
|
|
+# all_conversations = []
|
|
|
+# for root, dirs, files in os.walk(input_folder):
|
|
|
+# for file_name in files:
|
|
|
+# if file_name.endswith('.json'):
|
|
|
+# input_file = os.path.join(root, file_name)
|
|
|
+# with open(input_file, 'r', encoding='utf-8') as f_in:
|
|
|
+# for line in f_in:
|
|
|
+# if not line.strip():
|
|
|
+# continue
|
|
|
+# try:
|
|
|
+# conversations = json.loads(line)
|
|
|
+# conversation_obj = {"conversations": []}
|
|
|
+# for i in range(0, len(conversations), 2):
|
|
|
+# if i == 0:
|
|
|
+# conversation_obj["conversations"].append({"role": "system", "content": "你是污水处理厂技术专家,针对用户提出的问题,提供专业见解和建议并且清晰有条理,解决用户提出的问题"})
|
|
|
+# conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问:")})
|
|
|
+# conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答:")})
|
|
|
+# all_conversations.append(conversation_obj)
|
|
|
+# except json.JSONDecodeError:
|
|
|
+# print("Error: Invalid JSON format in file:", input_file)
|
|
|
+# continue
|
|
|
+# return all_conversations
|
|
|
+#
|
|
|
+# #指定输入文件夹
|
|
|
+# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/optimizationData'
|
|
|
+#
|
|
|
+# all_conversations = convert_json_files(input_folder)
|
|
|
+#
|
|
|
+# # # 指定输出文件
|
|
|
+# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/2024.4.09.json'
|
|
|
+#
|
|
|
+# # 将所有对话写入输出文件
|
|
|
+# with open(output_file, 'w', encoding='utf-8') as f_out:
|
|
|
+# for conversation_obj in all_conversations:
|
|
|
+# json.dump(conversation_obj, f_out, ensure_ascii=False)
|
|
|
+# f_out.write('\n')
|
|
|
+
|
|
|
+
|
|
|
+#这个可以查找文件夹里面所有的json文件并过滤
|
|
|
+# import json
|
|
|
+# import os
|
|
|
+#
|
|
|
+#
|
|
|
+# def convert_json_files(input_folder):
|
|
|
+# all_conversations = []
|
|
|
+# special_lines = []
|
|
|
+# id = 0
|
|
|
+# for root, dirs, files in os.walk(input_folder):
|
|
|
+# for file_name in files:
|
|
|
+# if file_name.endswith('.json'):
|
|
|
+# input_file = os.path.join(root, file_name)
|
|
|
+# with open(input_file, 'r', encoding='utf-8') as f_in:
|
|
|
+# for line in f_in:
|
|
|
+# if "如图所示:" in line:
|
|
|
+# special_lines.append(line)
|
|
|
+# continue
|
|
|
+# if not line.strip():
|
|
|
+# continue
|
|
|
+# try:
|
|
|
+# conversations = json.loads(line)
|
|
|
+# id += 1
|
|
|
+# conversation_obj = {"id":"identity_" + str(id),"conversations": []}
|
|
|
+#
|
|
|
+# for i in range(0, len(conversations), 2):
|
|
|
+# # if i == 0:
|
|
|
+# # conversation_obj["conversations"].append({"role": "system","content": ""})
|
|
|
+# conversation_obj["conversations"].append(
|
|
|
+# {"from": "user", "value": conversations[i].lstrip("问:")})
|
|
|
+# conversation_obj["conversations"].append(
|
|
|
+# {"from": "assistant", "value": conversations[i + 1].lstrip("答:")})
|
|
|
+# all_conversations.append(conversation_obj)
|
|
|
+# except json.JSONDecodeError:
|
|
|
+# print("Error: Invalid JSON format in file:", input_file)
|
|
|
+# continue
|
|
|
+#
|
|
|
+# # Write special lines to a separate JSON file
|
|
|
+# if special_lines:
|
|
|
+# special_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/special_file.json' # Define the path for the special file
|
|
|
+# with open(special_file, 'w', encoding='utf-8') as special_out:
|
|
|
+# for line in special_lines:
|
|
|
+# special_out.write(line)
|
|
|
+#
|
|
|
+# return all_conversations
|
|
|
+#
|
|
|
+#
|
|
|
+# # 指定输入文件夹
|
|
|
+# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/optimizationData'
|
|
|
+#
|
|
|
+# all_conversations = convert_json_files(input_folder)
|
|
|
+#
|
|
|
+# # 指定输出文件
|
|
|
+# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/hongshanData_water.json'
|
|
|
+#
|
|
|
+# # 将所有对话写入输出文件
|
|
|
+#
|
|
|
+# with open(output_file, 'w', encoding='utf-8') as f_out:
|
|
|
+# json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)
|
|
|
+
|
|
|
+
|
|
|
+# 输出到一个集合里面 并且格式化
|
|
|
+# import json
|
|
|
+# import os
|
|
|
+
|
|
|
+# def convert_json_files(input_folder):
|
|
|
+# all_conversations = []
|
|
|
+# for file_name in os.listdir(input_folder):
|
|
|
+# if file_name.endswith('.json'):
|
|
|
+# input_file = os.path.join(input_folder, file_name)
|
|
|
+# with open(input_file, 'r', encoding='utf-8') as f_in:
|
|
|
+# for line in f_in:
|
|
|
+# try:
|
|
|
+# conversations = json.loads(line)
|
|
|
+# conversation_obj = {"conversations": []}
|
|
|
+# for i in range(0, len(conversations), 2):
|
|
|
+# if i == 0:
|
|
|
+# conversation_obj["conversations"].append({"role": "system", "content": "假设你是一个污水处理厂技术专家,你要回答用户询问的问题."})
|
|
|
+# conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问:")})
|
|
|
+# conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答:")})
|
|
|
+# all_conversations.append(conversation_obj)
|
|
|
+# except json.JSONDecodeError:
|
|
|
+# print("Error: Invalid JSON format in file:", input_file)
|
|
|
+# continue
|
|
|
+# return all_conversations
|
|
|
+
|
|
|
+# # 指定输入文件夹
|
|
|
+# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/jsondata/hebing'
|
|
|
+
|
|
|
+# all_conversations = convert_json_files(input_folder)
|
|
|
+
|
|
|
+# # 指定输出文件
|
|
|
+# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/output.json'
|
|
|
+
|
|
|
+# # 将所有对话写入输出文件并格式化
|
|
|
+# with open(output_file, 'w', encoding='utf-8') as f_out:
|
|
|
+# json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+# Output is a JSON array; every object except the last is followed by a comma and a newline
|
|
|
+# import json
|
|
|
+# import os
|
|
|
+
|
|
|
+# def convert_json_files(input_folder):
|
|
|
+# all_conversations = []
|
|
|
+# for file_name in os.listdir(input_folder):
|
|
|
+# if file_name.endswith('.json'):
|
|
|
+# input_file = os.path.join(input_folder, file_name)
|
|
|
+# with open(input_file, 'r', encoding='utf-8') as f_in:
|
|
|
+# for line in f_in:
|
|
|
+# try:
|
|
|
+# conversations = json.loads(line)
|
|
|
+# conversation_obj = {"conversations": []}
|
|
|
+# for i in range(0, len(conversations), 2):
|
|
|
+# if i == 0:
|
|
|
+# conversation_obj["conversations"].append({"role": "system", "content": "假设你是一个污水处理厂技术专家,你要回答用户询问的问题."})
|
|
|
+# conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问:")})
|
|
|
+# conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答:")})
|
|
|
+# all_conversations.append(conversation_obj)
|
|
|
+# except json.JSONDecodeError:
|
|
|
+# print("Error: Invalid JSON format in file:", input_file)
|
|
|
+# continue
|
|
|
+# return all_conversations
|
|
|
+
|
|
|
+# # 指定输入文件夹
|
|
|
+# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/jsondata/hebing'
|
|
|
+
|
|
|
+# all_conversations = convert_json_files(input_folder)
|
|
|
+
|
|
|
+# # 指定输出文件
|
|
|
+# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/output.json'
|
|
|
+
|
|
|
+# # 将所有对话写入输出文件并格式化
|
|
|
+# with open(output_file, 'w', encoding='utf-8') as f_out:
|
|
|
+# f_out.write("[\n") # 开始列表
|
|
|
+# for i, conversation_obj in enumerate(all_conversations):
|
|
|
+# json.dump(conversation_obj, f_out, ensure_ascii=False)
|
|
|
+# # f_out.write('\n') # 每个对话对象后添加换行符
|
|
|
+# if i < len(all_conversations) - 1:
|
|
|
+# f_out.write(",\n") # 在除了最后一个对话对象之后添加逗号和换行符
|
|
|
+# f_out.write("]\n") # 结束列表
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+# qwen训练样本
|
|
|
+# import json
|
|
|
+# import os
|
|
|
+
|
|
|
+# def convert_json_files(input_folder):
|
|
|
+# all_conversations = []
|
|
|
+# special_lines = []
|
|
|
+# for root, dirs, files in os.walk(input_folder):
|
|
|
+# for file_name in files:
|
|
|
+# if file_name.endswith('.json'):
|
|
|
+# input_file = os.path.join(root, file_name)
|
|
|
+# with open(input_file, 'r', encoding='utf-8') as f_in:
|
|
|
+# for line in f_in:
|
|
|
+# if "如图所示:" in line:
|
|
|
+# special_lines.append(line)
|
|
|
+# continue
|
|
|
+# if not line.strip():
|
|
|
+# continue
|
|
|
+# try:
|
|
|
+# conversations = json.loads(line)
|
|
|
+# conversation_obj = {"conversations": []}
|
|
|
+# print(len(len(conversations)))
|
|
|
+# for i in range(0, len(conversations), 2):
|
|
|
+# # if i == 0:
|
|
|
+# # conversation_obj["conversations"].append({"role": "system","content": ""})
|
|
|
+# conversation_obj["conversations"].append(
|
|
|
+# {"role": "user", "content": conversations[i].lstrip("问:")})
|
|
|
+# conversation_obj["conversations"].append(
|
|
|
+# {"role": "assistant", "content": conversations[i + 1].lstrip("答:")})
|
|
|
+# all_conversations.append(conversation_obj)
|
|
|
+# except json.JSONDecodeError:
|
|
|
+# print("Error: Invalid JSON format in file:", input_file)
|
|
|
+# continue
|
|
|
+
|
|
|
+# # Write special lines to a separate JSON file
|
|
|
+# if special_lines:
|
|
|
+# special_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/special_file.json' # Define the path for the special file
|
|
|
+# with open(special_file, 'w', encoding='utf-8') as special_out:
|
|
|
+# for line in special_lines:
|
|
|
+# special_out.write(line)
|
|
|
+
|
|
|
+# return all_conversations
|
|
|
+
|
|
|
+
|
|
|
+# # 指定输入文件夹
|
|
|
+# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/optimizationData'
|
|
|
+
|
|
|
+# all_conversations = convert_json_files(input_folder)
|
|
|
+
|
|
|
+# # 指定输出文件
|
|
|
+# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/hongshanData.json'
|
|
|
+
|
|
|
+# # 将所有对话写入输出文件
|
|
|
+
|
|
|
+# # with open(output_file, 'w', encoding='utf-8') as f_out:
|
|
|
+# # json.dump(all_conversation, f_out, ensure_ascii=False, indent=4)
|
|
|
+
|
|
|
+# with open(output_file, 'w', encoding='utf-8') as f_out:
|
|
|
+# for conversation_obj in all_conversations:
|
|
|
+# json.dump(conversation_obj, f_out, ensure_ascii=False)
|
|
|
+# f_out.write('\n')
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
+# Qwen training samples: one record per conversation with instruction / input / output / history fields
|
|
|
+import json
|
|
|
+import os
|
|
|
+import random
|
|
|
def _strip_label(text, label):
    """Return *text* without one leading *label*; unchanged if it is absent."""
    # str.lstrip(label) would treat the label as a *set of characters* and
    # keep stripping (e.g. "问:问题" -> "题"), so remove the exact prefix only.
    if text.startswith(label):
        return text[len(label):]
    return text


def _build_record(conversations):
    """Turn alternating question/answer strings into one training record.

    Returns None when *conversations* is not a non-empty, even-length list
    of strings, because such input cannot be split into complete Q/A pairs.
    """
    if not isinstance(conversations, list) or not conversations:
        return None
    if len(conversations) % 2 != 0:
        return None
    if not all(isinstance(turn, str) for turn in conversations):
        return None

    pairs = [
        [
            _strip_label(conversations[i], "问:"),
            _strip_label(conversations[i + 1], "答:"),
        ]
        for i in range(0, len(conversations), 2)
    ]
    # The final pair is the sample itself; every earlier pair is context.
    instruction, output = pairs[-1]
    return {
        "instruction": instruction,
        "input": "",
        "output": output,
        "history": pairs[:-1],
    }


def convert_json_files(input_folder, special_file=None):
    """Convert line-delimited Q/A JSON files into instruction-tuning records.

    Walks *input_folder* recursively and reads every ``*.json`` file line by
    line. Each non-blank line is expected to hold a JSON list of alternating
    question ("问:" prefixed) and answer ("答:" prefixed) strings. The last
    pair becomes ``instruction`` / ``output``; earlier pairs are collected
    in ``history`` as ``[question, answer]`` lists. ``input`` is always "".

    Lines containing "如图所示:" are not converted; they are written
    verbatim to *special_file* instead. Lines that are not valid JSON, or
    that do not hold a non-empty even-length list of strings, are reported
    on stdout and skipped.

    Args:
        input_folder: Root directory to scan for ``.json`` files.
        special_file: Path that receives the set-aside lines. Defaults to
            the path this script has always used. The file is only written
            when at least one such line was found.

    Returns:
        A list of record dicts with the keys ``instruction``, ``input``,
        ``output`` and ``history``, in sorted traversal order.
    """
    if special_file is None:
        special_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/special_file.json'

    all_conversations = []
    special_lines = []
    for root, dirs, files in os.walk(input_folder):
        # Sorting in place makes os.walk visit sub-directories in a stable
        # order, so the result does not depend on the filesystem.
        dirs.sort()
        for file_name in sorted(files):
            if not file_name.endswith('.json'):
                continue
            input_file = os.path.join(root, file_name)
            with open(input_file, 'r', encoding='utf-8') as f_in:
                for line_no, line in enumerate(f_in, start=1):
                    if "如图所示:" in line:
                        special_lines.append(line)
                        continue
                    if not line.strip():
                        continue
                    try:
                        conversations = json.loads(line)
                    except json.JSONDecodeError:
                        print("Error: Invalid JSON format in file:", input_file)
                        continue
                    record = _build_record(conversations)
                    if record is None:
                        print(
                            "Error: Expected an even-length list of question/answer strings in file:",
                            input_file,
                            "line:",
                            line_no,
                        )
                        continue
                    all_conversations.append(record)

    # Write the set-aside lines to their own file
    if special_lines:
        with open(special_file, 'w', encoding='utf-8') as special_out:
            special_out.writelines(special_lines)

    return all_conversations
|
|
|
+
|
|
|
+
|
|
|
# Folder that holds the source JSON files
input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/孙浩prompt'

all_conversations = convert_json_files(input_folder)
print(len(all_conversations))

# File that receives the merged records
output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/merged.json'

# Randomise the record order in place before anything is written
random.shuffle(all_conversations)

# Serialise the whole list as one pretty-printed JSON array
with open(output_file, mode='w', encoding='utf-8') as f_out:
    f_out.write(json.dumps(all_conversations, ensure_ascii=False, indent=4))
|
|
|
+
|
|
|
+# with open(output_file, 'w', encoding='utf-8') as f_out:
|
|
|
+# for conversation_obj in all_conversations:
|
|
|
+# json.dump(conversation_obj, f_out, ensure_ascii=False)
|
|
|
+# f_out.write('\n')
|