# Walk every JSON file under the input folder and write each conversation object on its own line
# (JSON Lines), rather than wrapping them all in one array.
# import json
# import os
#
# def convert_json_files(input_folder):
#     all_conversations = []
#     for root, dirs, files in os.walk(input_folder):
#         for file_name in files:
#             if file_name.endswith('.json'):
#                 input_file = os.path.join(root, file_name)
#                 with open(input_file, 'r', encoding='utf-8') as f_in:
#                     for line in f_in:
#                         if not line.strip():
#                             continue
#                         try:
#                             conversations = json.loads(line)
#                             conversation_obj = {"conversations": []}
#                             for i in range(0, len(conversations), 2):
#                                 if i == 0:
#                                     conversation_obj["conversations"].append({"role": "system", "content": "你是污水处理厂技术专家,针对用户提出的问题,提供专业见解和建议并且清晰有条理,解决用户提出的问题"})
#                                 conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问:")})
#                                 conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答:")})
#                             all_conversations.append(conversation_obj)
#                         except json.JSONDecodeError:
#                             print("Error: Invalid JSON format in file:", input_file)
#                             continue
#     return all_conversations
#
# # Specify the input folder
# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/optimizationData'
#
# all_conversations = convert_json_files(input_folder)
#
# # Specify the output file
# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/2024.4.09.json'
#
# # Write all conversations to the output file
# with open(output_file, 'w', encoding='utf-8') as f_out:
#     for conversation_obj in all_conversations:
#         json.dump(conversation_obj, f_out, ensure_ascii=False)
#         f_out.write('\n')


# This version walks every JSON file in the folder and filters out the "如图所示" lines.
# import json
# import os
#
#
# def convert_json_files(input_folder):
#     all_conversations = []
#     special_lines = []
#     id = 0
#     for root, dirs, files in os.walk(input_folder):
#         for file_name in files:
#             if file_name.endswith('.json'):
#                 input_file = os.path.join(root, file_name)
#                 with open(input_file, 'r', encoding='utf-8') as f_in:
#                     for line in f_in:
#                         if "如图所示:" in line:
#                             special_lines.append(line)
#                             continue
#                         if not line.strip():
#                             continue
#                         try:
#                             conversations = json.loads(line)
#                             id += 1
#                             conversation_obj = {"id": "identity_" + str(id), "conversations": []}
#                             # for i in range(0, len(conversations), 2):
#                             # if i == 0:
#                             #     conversation_obj["conversations"].append({"role": "system", "content": ""})
#                             conversation_obj["conversations"].append(
#                                 {"from": "user", "value": conversations[i].lstrip("问:")})
#                             conversation_obj["conversations"].append(
#                                 {"from": "assistant", "value": conversations[i + 1].lstrip("答:")})
#                             all_conversations.append(conversation_obj)
#                         except json.JSONDecodeError:
#                             print("Error: Invalid JSON format in file:", input_file)
#                             continue
#
#     # Write the special lines to a separate JSON file
#     if special_lines:
#         special_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/special_file.json'  # Define the path for the special file
#         with open(special_file, 'w', encoding='utf-8') as special_out:
#             for line in special_lines:
#                 special_out.write(line)
#
#     return all_conversations
#
#
# # Specify the input folder
# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/optimizationData'
#
# all_conversations = convert_json_files(input_folder)
#
# # Specify the output file
# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/hongshanData_water.json'
#
# # Write all conversations to the output file
# with open(output_file, 'w', encoding='utf-8') as f_out:
#     json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)
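
# (Added note, not part of the original script) All converters in this file, including the legacy
# variants kept above and below, expect the same raw layout: every line of a source .json file is a
# JSON array of alternating question/answer strings prefixed with "问:" and "答:", for example
#     ["问:什么是曝气?", "答:曝气是向水中通入空气的过程。"]
# The sentence itself is invented for illustration; only the prefixes and the alternating order are
# taken from the code.
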
# Collect everything into one array and pretty-print it.
# import json
# import os
# def convert_json_files(input_folder):
#     all_conversations = []
#     for file_name in os.listdir(input_folder):
#         if file_name.endswith('.json'):
#             input_file = os.path.join(input_folder, file_name)
#             with open(input_file, 'r', encoding='utf-8') as f_in:
#                 for line in f_in:
#                     try:
#                         conversations = json.loads(line)
#                         conversation_obj = {"conversations": []}
#                         for i in range(0, len(conversations), 2):
#                             if i == 0:
#                                 conversation_obj["conversations"].append({"role": "system", "content": "假设你是一个污水处理厂技术专家,你要回答用户询问的问题."})
#                             conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问:")})
#                             conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答:")})
#                         all_conversations.append(conversation_obj)
#                     except json.JSONDecodeError:
#                         print("Error: Invalid JSON format in file:", input_file)
#                         continue
#     return all_conversations
# # Specify the input folder
# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/jsondata/hebing'
# all_conversations = convert_json_files(input_folder)
# # Specify the output file
# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/output.json'
# # Write all conversations to the output file, pretty-printed
# with open(output_file, 'w', encoding='utf-8') as f_out:
#     json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)


# Output a JSON array, appending a comma and a newline after every object.
# import json
# import os
# def convert_json_files(input_folder):
#     all_conversations = []
#     for file_name in os.listdir(input_folder):
#         if file_name.endswith('.json'):
#             input_file = os.path.join(input_folder, file_name)
#             with open(input_file, 'r', encoding='utf-8') as f_in:
#                 for line in f_in:
#                     try:
#                         conversations = json.loads(line)
#                         conversation_obj = {"conversations": []}
#                         for i in range(0, len(conversations), 2):
#                             if i == 0:
#                                 conversation_obj["conversations"].append({"role": "system", "content": "假设你是一个污水处理厂技术专家,你要回答用户询问的问题."})
#                             conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问:")})
#                             conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答:")})
#                         all_conversations.append(conversation_obj)
#                     except json.JSONDecodeError:
#                         print("Error: Invalid JSON format in file:", input_file)
#                         continue
#     return all_conversations
# # Specify the input folder
# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/jsondata/hebing'
# all_conversations = convert_json_files(input_folder)
# # Specify the output file
# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/output.json'
# # Write all conversations to the output file, formatted manually
# with open(output_file, 'w', encoding='utf-8') as f_out:
#     f_out.write("[\n")  # open the list
#     for i, conversation_obj in enumerate(all_conversations):
#         json.dump(conversation_obj, f_out, ensure_ascii=False)
#         # f_out.write('\n')  # newline after each conversation object
#         if i < len(all_conversations) - 1:
#             f_out.write(",\n")  # comma and newline after every object except the last
#     f_out.write("]\n")  # close the list


# Qwen training samples
import json
import os
import random
import re


# Replace image tags with Markdown links
def replace_image_tags(text):
    # Match extensions both with and without a leading '.'
    pattern = r'@([^@]+?)(\.(jpg|jpeg|png|gif)|jpg|jpeg|png|gif)@\$\s*'
    # twoPattern = r'@[A-Z0-9.]+@\$\s*'

    # Replacement template: prepend a '.' only when the match did not already include one
    def replace_template(match):
        filename = match.group(1)
        extension = match.group(2)
        if not extension.startswith('.'):
            extension = '.' + extension
        return f'![{filename}](https://static.fuxicarbon.com/modelData/{filename}{extension})'

    # Substitute with re.sub
    new_text = re.sub(pattern, replace_template, text)
    # result = re.sub(twoPattern, '', new_text)
    return new_text
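

# Illustrative sketch (added): a defined-but-unused helper showing what replace_image_tags does to
# an "@name.ext@$" marker. The file name and sentence are hypothetical; the URL prefix is the one
# hard-coded in replace_image_tags above.
def _demo_replace_image_tags():
    sample = "沉淀池结构 @settling_tank.png@$ 如上图。"
    print(replace_image_tags(sample))
    # Expected: 沉淀池结构 ![settling_tank](https://static.fuxicarbon.com/modelData/settling_tank.png)如上图。
    # (the trailing \s* in the pattern swallows the space after "@$")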


def convert_image_format(text):
    # First regex: extension captured together with (or without) its leading dot
    pattern1 = r'@([^@]+?)(\.(jpg|jpeg|png|gif)|jpg|jpeg|png|gif)?@[\$\$]'
    # Second regex: optional ".ext" as a non-capturing group
    pattern2 = r'@([^@]+?)(?:\.(jpg|jpeg|png|gif))?@[\$\$]'

    # Replacement template: build the link depending on whether an extension was captured
    def replace_template(match):
        filename = match.group(1)
        extension = match.group(3) if match.group(3) else match.group(2)
        if extension:
            # If the captured extension does not start with '.', prepend one
            if not extension.startswith('.'):
                extension = '.' + extension
        else:
            extension = ''
        return f'![{filename}](https://static.fuxicarbon.com/modelData/{filename}{extension})'

    # Apply the first pattern
    new_text = re.sub(pattern1, replace_template, text)
    # Apply the second pattern (pattern1 already treats the extension as optional, so this pass
    # usually has nothing left to rewrite)
    new_text = re.sub(pattern2, replace_template, new_text)

    return new_text


def convert_json_files_sharegpt(input_folder):
    all_conversations = []
    special_lines = []
    for root, dirs, files in os.walk(input_folder):
        for file_name in files:
            if file_name.endswith('.json'):
                input_file = os.path.join(root, file_name)
                with open(input_file, 'r', encoding='utf-8') as f_in:
                    for line in f_in:
                        if "如图所示:" in line:
                            special_lines.append(line)
                            continue
                        if not line.strip():
                            continue
                        try:
                            conversations = json.loads(line)
                            conversation_obj = {"conversations": []}
                            for i in range(0, len(conversations), 2):
                                # if i == 0:
                                #     conversation_obj["conversations"].append({"system": "你是信义污水厂助手。"})
                                # removeprefix (Python 3.9+) drops only the literal "问:"/"答:" prefix;
                                # lstrip("问:") treats the argument as a character set and can eat the
                                # start of the actual text
                                conversation_obj["conversations"].append(
                                    {"from": "human", "value": conversations[i].removeprefix("问:")})
                                conversation_obj["conversations"].append(
                                    {"from": "gpt", "value": conversations[i + 1].removeprefix("答:")})
                            all_conversations.append(conversation_obj)
                        except json.JSONDecodeError:
                            print("Error: Invalid JSON format in file:", input_file)
                            continue

    # Write the special lines to a separate JSON file
    if special_lines:
        special_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/special_file.json'  # Define the path for the special file
        with open(special_file, 'w', encoding='utf-8') as special_out:
            for line in special_lines:
                special_out.write(line)

    return all_conversations


# Format the training data as ShareGPT
def formatSharegpt():
    # Specify the input folder
    input_folder = './book/2024.5.13/'
    all_conversations = convert_json_files_sharegpt(input_folder)
    # Specify the output file
    output_file = './book/shareGpt/2024.5.13/shareGpt.json'
    # Make sure the output directory exists
    output_dir = os.path.dirname(output_file)
    os.makedirs(output_dir, exist_ok=True)

    # Optionally shuffle the data before writing it out
    # random.shuffle(all_conversations)

    # Write all conversations to the output file
    with open(output_file, 'w', encoding='utf-8') as f_out:
        json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)

    # Alternative: write line-delimited JSON objects instead of one array
    # with open(output_file, 'w', encoding='utf-8') as f_out:
    #     for conversation_obj in all_conversations:
    #         json.dump(conversation_obj, f_out, ensure_ascii=False)
    #         f_out.write('\n')
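

# Illustrative sketch (added): convert_image_format, defined above, accepts markers with or without
# the dot before the extension and emits the same Markdown link. The marker texts are hypothetical.
def _demo_convert_image_format():
    print(convert_image_format("工艺流程见 @process_flow.jpg@$"))
    # Expected: 工艺流程见 ![process_flow](https://static.fuxicarbon.com/modelData/process_flow.jpg)
    print(convert_image_format("工艺流程见 @process_flowjpg@$"))
    # Expected: 工艺流程见 ![process_flow](https://static.fuxicarbon.com/modelData/process_flow.jpg)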


def convert_json_files(input_folder):
    all_conversations = []
    # Get the folder name, used as the id prefix
    folder_name = os.path.basename(os.path.normpath(input_folder))
    for root, dirs, files in os.walk(input_folder):
        for file_name in files:
            # TODO: only files named good.json, reg_lines.json or short.json qualify
            if file_name in ["good.json", "reg_lines.json", "short.json"]:
                input_file = os.path.join(root, file_name)
                with open(input_file, 'r', encoding='utf-8') as f_in:
                    for line in f_in:
                        if not line.strip():
                            continue
                        try:
                            conversations = json.loads(line)
                            conversation_obj = {
                                "id": f"{folder_name}_{len(all_conversations)}",
                                "instruction": "",
                                "input": "",
                                "output": "",
                                "history": []
                            }
                            for i in range(0, len(conversations), 2):
                                # if i == 0:
                                #     conversation_obj["conversations"].append({"role": "system","content": ""})
                                # removeprefix (Python 3.9+) drops only the literal prefix; the spaces
                                # removed below are the ASCII and full-width space characters
                                question = conversations[i].removeprefix("问:").strip().replace(" ", "").replace("\u3000", "")
                                answer = conversations[i + 1].removeprefix("答:").strip().replace(" ", "").replace("\u3000", "")
                                # match = re.search(r'times:(.*)', question)
                                # if match:
                                #     conversation_obj["time"] = match.group(1)
                                #     question = re.sub(r'times:.*', '', question)
                                question = convert_image_format(question)
                                answer = convert_image_format(answer)
                                if len(conversations) == 2:
                                    conversation_obj["instruction"] = question
                                    conversation_obj["output"] = answer
                                elif len(conversations) > 2:
                                    # TODO: every Q/A pair before the last one goes into the history
                                    if i < len(conversations) - 2:
                                        history = []
                                        history.append(question)
                                        history.append(answer)
                                        conversation_obj["history"].append(history)
                                    # TODO: the last Q/A pair becomes the instruction/output
                                    elif i == len(conversations) - 2:  # final pair of the dialogue
                                        conversation_obj["instruction"] = question
                                        conversation_obj["output"] = answer
                            all_conversations.append(conversation_obj)
                        except json.JSONDecodeError:
                            print("Error: Invalid JSON format in file:", input_file)
                            continue
    return all_conversations


# Format the training data as Alpaca
def formatAlpaca(input_folder, output_file):
    all_conversations = convert_json_files(input_folder)
    print(len(all_conversations))

    # Optionally shuffle the data before writing it out
    # random.shuffle(all_conversations)

    with open(output_file, 'w', encoding='utf-8') as f_out:
        json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)

    # Alternative: write line-delimited JSON objects instead of one array
    # with open(output_file, 'w', encoding='utf-8') as f_out:
    #     for conversation_obj in all_conversations:
    #         json.dump(conversation_obj, f_out, ensure_ascii=False)
    #         f_out.write('\n')


def formatFlatten(input_folder):
    all_conversations = []
    for root, dirs, files in os.walk(input_folder):
        for file_name in files:
            if file_name.endswith('.json'):
                input_file = os.path.join(root, file_name)
                with open(input_file, 'r', encoding='utf-8') as f_in:
                    for line in f_in:
                        if not line.strip():
                            continue
                        try:
                            conversations = json.loads(line)
                            for i in range(0, len(conversations), 2):
                                if "信义污水厂" not in conversations[i]:
                                    conversations[i] = conversations[i].replace("问:", "问:信义污水厂的")
                                all_conversations.append([conversations[i], conversations[i + 1]])
                        except json.JSONDecodeError:
                            print("Error: Invalid JSON format in file:", input_file)
                            continue

    # Write the flattened Q/A pairs to a JSON Lines file
    if all_conversations:
        special_file = './excel/huaxiang/huaxiang_2024.5.21.json'  # Define the output path
        with open(special_file, 'w', encoding='utf-8') as f_out:
            for conversation_obj in all_conversations:
                json.dump(conversation_obj, f_out, ensure_ascii=False)
                f_out.write('\n')


if __name__ == "__main__":
    input_folder = './book/jiejin/'
    output_file = './book/alpace/jiejin.json'
    formatAlpaca(input_folder, output_file)
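

# Illustrative self-check (added, never called by the pipeline): builds a temporary good.json with
# one hypothetical two-turn dialogue and runs convert_json_files on it, so the Alpaca layout can be
# inspected: earlier turns land in "history", the final turn becomes instruction/output. The
# questions and answers below are invented for demonstration only.
def _demo_alpaca_layout():
    import tempfile
    sample = ["问:什么是曝气池?", "答:曝气池是进行充氧的反应池。", "问:它的作用是什么?", "答:主要用于向活性污泥供氧。"]
    with tempfile.TemporaryDirectory() as tmp_dir:
        with open(os.path.join(tmp_dir, "good.json"), "w", encoding="utf-8") as f:
            f.write(json.dumps(sample, ensure_ascii=False) + "\n")
        records = convert_json_files(tmp_dir)
        # Expected: one record with history == [["什么是曝气池?", "曝气池是进行充氧的反应池。"]]
        # and instruction/output taken from the last question/answer pair.
        print(json.dumps(records, ensure_ascii=False, indent=2))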