123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424 |
- #查找当前文件所有json文件 输出到一个文件夹每个会话对象换行不在一个集合里面
- # import json
- # import os
- #
- # def convert_json_files(input_folder):
- # all_conversations = []
- # for root, dirs, files in os.walk(input_folder):
- # for file_name in files:
- # if file_name.endswith('.json'):
- # input_file = os.path.join(root, file_name)
- # with open(input_file, 'r', encoding='utf-8') as f_in:
- # for line in f_in:
- # if not line.strip():
- # continue
- # try:
- # conversations = json.loads(line)
- # conversation_obj = {"conversations": []}
- # for i in range(0, len(conversations), 2):
- # if i == 0:
- # conversation_obj["conversations"].append({"role": "system", "content": "你是污水处理厂技术专家,针对用户提出的问题,提供专业见解和建议并且清晰有条理,解决用户提出的问题"})
- # conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问:")})
- # conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答:")})
- # all_conversations.append(conversation_obj)
- # except json.JSONDecodeError:
- # print("Error: Invalid JSON format in file:", input_file)
- # continue
- # return all_conversations
- #
- # #指定输入文件夹
- # input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/optimizationData'
- #
- # all_conversations = convert_json_files(input_folder)
- #
- # # # 指定输出文件
- # output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/2024.4.09.json'
- #
- # # 将所有对话写入输出文件
- # with open(output_file, 'w', encoding='utf-8') as f_out:
- # for conversation_obj in all_conversations:
- # json.dump(conversation_obj, f_out, ensure_ascii=False)
- # f_out.write('\n')
- #这个可以查找文件夹里面所有的json文件并过滤
- # import json
- # import os
- #
- #
- # def convert_json_files(input_folder):
- # all_conversations = []
- # special_lines = []
- # id = 0
- # for root, dirs, files in os.walk(input_folder):
- # for file_name in files:
- # if file_name.endswith('.json'):
- # input_file = os.path.join(root, file_name)
- # with open(input_file, 'r', encoding='utf-8') as f_in:
- # for line in f_in:
- # if "如图所示:" in line:
- # special_lines.append(line)
- # continue
- # if not line.strip():
- # continue
- # try:
- # conversations = json.loads(line)
- # id += 1
- # conversation_obj = {"id":"identity_" + str(id),"conversations": []}
- #
- # for i in range(0, len(conversations), 2):
- # # if i == 0:
- # # conversation_obj["conversations"].append({"role": "system","content": ""})
- # conversation_obj["conversations"].append(
- # {"from": "user", "value": conversations[i].lstrip("问:")})
- # conversation_obj["conversations"].append(
- # {"from": "assistant", "value": conversations[i + 1].lstrip("答:")})
- # all_conversations.append(conversation_obj)
- # except json.JSONDecodeError:
- # print("Error: Invalid JSON format in file:", input_file)
- # continue
- #
- # # Write special lines to a separate JSON file
- # if special_lines:
- # special_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/special_file.json' # Define the path for the special file
- # with open(special_file, 'w', encoding='utf-8') as special_out:
- # for line in special_lines:
- # special_out.write(line)
- #
- # return all_conversations
- #
- #
- # # 指定输入文件夹
- # input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/optimizationData'
- #
- # all_conversations = convert_json_files(input_folder)
- #
- # # 指定输出文件
- # output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/hongshanData_water.json'
- #
- # # 将所有对话写入输出文件
- #
- # with open(output_file, 'w', encoding='utf-8') as f_out:
- # json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)
- # 输出到一个集合里面 并且格式化
- # import json
- # import os
- # def convert_json_files(input_folder):
- # all_conversations = []
- # for file_name in os.listdir(input_folder):
- # if file_name.endswith('.json'):
- # input_file = os.path.join(input_folder, file_name)
- # with open(input_file, 'r', encoding='utf-8') as f_in:
- # for line in f_in:
- # try:
- # conversations = json.loads(line)
- # conversation_obj = {"conversations": []}
- # for i in range(0, len(conversations), 2):
- # if i == 0:
- # conversation_obj["conversations"].append({"role": "system", "content": "假设你是一个污水处理厂技术专家,你要回答用户询问的问题."})
- # conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问:")})
- # conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答:")})
- # all_conversations.append(conversation_obj)
- # except json.JSONDecodeError:
- # print("Error: Invalid JSON format in file:", input_file)
- # continue
- # return all_conversations
- # # 指定输入文件夹
- # input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/jsondata/hebing'
- # all_conversations = convert_json_files(input_folder)
- # # 指定输出文件
- # output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/output.json'
- # # 将所有对话写入输出文件并格式化
- # with open(output_file, 'w', encoding='utf-8') as f_out:
- # json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)
- # 输出的内容是数组并且每个对象会添加,以后换行
- # import json
- # import os
- # def convert_json_files(input_folder):
- # all_conversations = []
- # for file_name in os.listdir(input_folder):
- # if file_name.endswith('.json'):
- # input_file = os.path.join(input_folder, file_name)
- # with open(input_file, 'r', encoding='utf-8') as f_in:
- # for line in f_in:
- # try:
- # conversations = json.loads(line)
- # conversation_obj = {"conversations": []}
- # for i in range(0, len(conversations), 2):
- # if i == 0:
- # conversation_obj["conversations"].append({"role": "system", "content": "假设你是一个污水处理厂技术专家,你要回答用户询问的问题."})
- # conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问:")})
- # conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答:")})
- # all_conversations.append(conversation_obj)
- # except json.JSONDecodeError:
- # print("Error: Invalid JSON format in file:", input_file)
- # continue
- # return all_conversations
- # # 指定输入文件夹
- # input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/jsondata/hebing'
- # all_conversations = convert_json_files(input_folder)
- # # 指定输出文件
- # output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/output.json'
- # # 将所有对话写入输出文件并格式化
- # with open(output_file, 'w', encoding='utf-8') as f_out:
- # f_out.write("[\n") # 开始列表
- # for i, conversation_obj in enumerate(all_conversations):
- # json.dump(conversation_obj, f_out, ensure_ascii=False)
- # # f_out.write('\n') # 每个对话对象后添加换行符
- # if i < len(all_conversations) - 1:
- # f_out.write(",\n") # 在除了最后一个对话对象之后添加逗号和换行符
- # f_out.write("]\n") # 结束列表
- # qwen训练样本
- import json
- import os
- import random
- import re
def replace_image_tags(text):
    """Rewrite ``@name.ext@$`` image tags in *text* as Markdown image links.

    A tag is ``@``, a file name, one of the extensions jpg/jpeg/png/gif
    (written with or without a leading dot), then ``@$``.  Any whitespace
    directly after the tag is consumed together with it.  Tags without one
    of those extensions are left untouched.

    Args:
        text: The string to scan.

    Returns:
        A copy of *text* in which every tag is replaced by
        ``![name](https://static.fuxicarbon.com/modelData/name.ext)``.
    """
    # Covers both the dotted ("pic.png") and the dot-less ("picpng") spelling.
    tag_re = re.compile(r'@([^@]+?)(\.(jpg|jpeg|png|gif)|jpg|jpeg|png|gif)@\$\s*')

    def to_markdown(hit):
        stem, suffix = hit.group(1, 2)
        # Normalise the dot-less spelling so the URL always carries ".ext".
        dotted = suffix if suffix.startswith('.') else '.' + suffix
        return f'![{stem}](https://static.fuxicarbon.com/modelData/{stem}{dotted})'

    return tag_re.sub(to_markdown, text)
def convert_image_format(text):
    """Convert ``@name[.ext]@$`` image tags in *text* to Markdown image links.

    A tag is ``@``, a file name, an optional jpg/jpeg/png/gif extension
    (written with or without a leading dot), then ``@$``.  Each tag becomes
    ``![name](https://static.fuxicarbon.com/modelData/name.ext)``; when the
    tag has no extension the URL ends with the bare name.

    The earlier version ran a second, narrower pattern through the same
    callback.  That pattern has only two groups, so the callback's
    ``match.group(3)`` raised ``IndexError`` whenever it matched.  Everything
    it can match in the input is already matched by the first pattern, so
    the only thing left for it to hit was Markdown produced by the first
    pass.  A single pass is therefore both sufficient and safe.

    Args:
        text: The string to scan.

    Returns:
        A copy of *text* with every image tag replaced.
    """
    # NOTE(review): the character class lists "$" twice as written; if a
    # second terminator (for example a full-width dollar sign) was intended
    # it still has to be added - confirm against the raw data.
    pattern = r'@([^@]+?)(\.(jpg|jpeg|png|gif)|jpg|jpeg|png|gif)?@[\$\$]'

    def replace_template(match):
        filename = match.group(1)
        # Group 2 is the whole optional suffix (".png" or "png"); it is None
        # when the tag carries no extension.
        extension = match.group(2) or ''
        if extension and not extension.startswith('.'):
            extension = '.' + extension
        return f'![{filename}](https://static.fuxicarbon.com/modelData/{filename}{extension})'

    return re.sub(pattern, replace_template, text)
def _sharegpt_turns(conversations):
    """Build ShareGPT turns from a flat ``[question, answer, ...]`` list.

    Returns ``None`` when the decoded value is not a non-empty list of
    strings with an even number of entries, so the caller can skip it.
    """
    if not isinstance(conversations, list) or not conversations:
        return None
    if len(conversations) % 2 or not all(isinstance(item, str) for item in conversations):
        return None

    turns = []
    for question, answer in zip(conversations[::2], conversations[1::2]):
        # Remove the speaker marker as a prefix.  str.lstrip() treats its
        # argument as a set of characters and would also eat the first
        # characters of a text that happens to start with one of them.
        if question.startswith("问:"):
            question = question[len("问:"):]
        if answer.startswith("答:"):
            answer = answer[len("答:"):]
        turns.append({"from": "human", "value": question})
        turns.append({"from": "gpt", "value": answer})
    return turns


def convert_json_files_sharegpt(
        input_folder,
        special_file='/Users/yushanghui/hongshantianping/ai训练/data/lineJson/special_file.json'):
    """Collect ShareGPT-style conversations from every ``.json`` file in a tree.

    Each non-blank line of each file is decoded as a JSON list that
    alternates question and answer strings.  Lines containing "如图所示:" are
    not converted; they are copied verbatim to *special_file* instead.
    Lines that are not valid JSON, or that do not decode to an even-length
    list of strings, are reported on stdout and skipped.

    Args:
        input_folder: Root directory that is walked recursively.
        special_file: Path that receives the set-aside lines.  It is only
            written when at least one such line was found.

    Returns:
        A list of ``{"conversations": [{"from": "human", ...},
        {"from": "gpt", ...}, ...]}`` dictionaries, one per converted line.
    """
    all_conversations = []
    special_lines = []
    for root, _dirs, files in os.walk(input_folder):
        for file_name in files:
            if not file_name.endswith('.json'):
                continue
            input_file = os.path.join(root, file_name)
            with open(input_file, 'r', encoding='utf-8') as f_in:
                for line in f_in:
                    if "如图所示:" in line:
                        special_lines.append(line)
                        continue
                    if not line.strip():
                        continue
                    try:
                        conversations = json.loads(line)
                    except json.JSONDecodeError:
                        print("Error: Invalid JSON format in file:", input_file)
                        continue
                    turns = _sharegpt_turns(conversations)
                    if turns is None:
                        print("Error: Unpaired or non-text conversation in file:", input_file)
                        continue
                    all_conversations.append({"conversations": turns})

    # Keep the set-aside lines in their own file so they can be reviewed.
    if special_lines:
        with open(special_file, 'w', encoding='utf-8') as special_out:
            special_out.writelines(special_lines)
    return all_conversations
def formatSharegpt(input_folder='./book/2024.5.13/',
                   output_file='./book/shareGpt/2024.5.13/shareGpt.json'):
    """Convert a folder of raw Q&A dumps into one ShareGPT-format JSON file.

    The conversations returned by ``convert_json_files_sharegpt`` are
    written to *output_file* as a single indented JSON array, in the order
    they were collected (shuffling is intentionally not applied).

    Args:
        input_folder: Directory tree holding the source ``.json`` files.
            Defaults to the folder this script was originally run against.
        output_file: Destination path; its parent directory is created if
            it does not exist yet.
    """
    all_conversations = convert_json_files_sharegpt(input_folder)

    # os.makedirs('') raises, so only create a directory when the output
    # path actually has a directory component.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # One JSON array for the whole data set.  To emit JSON Lines instead,
    # dump each conversation on its own and write '\n' after it.
    with open(output_file, 'w', encoding='utf-8') as f_out:
        json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)
def _is_paired_dialogue(conversations):
    """Return True for a non-empty, even-length list made only of strings."""
    return (
        isinstance(conversations, list)
        and bool(conversations)
        and len(conversations) % 2 == 0
        and all(isinstance(item, str) for item in conversations)
    )


def _clean_alpaca_text(text, marker):
    """Drop the speaker *marker*, remove spaces and convert image tags."""
    # Remove the marker as a prefix.  str.lstrip() treats its argument as a
    # set of characters and would also eat the first characters of a text
    # that happens to start with one of them.
    if text.startswith(marker):
        text = text[len(marker):]
    # NOTE(review): the two replace() calls are identical as written, so the
    # second one is a no-op; it may have been meant for another space
    # character (non-breaking or full-width) - confirm against the raw data.
    text = text.strip().replace(" ", "").replace(" ", "")
    return convert_image_format(text)


def convert_json_files(input_folder):
    """Collect Alpaca-style records from the curated files in a folder tree.

    Only files named ``good.json``, ``reg_lines.json`` or ``short.json`` are
    read.  Each non-blank line is decoded as a JSON list that alternates
    question and answer strings.  The last question/answer pair becomes
    ``instruction``/``output``; every earlier pair is stored, in order, as a
    ``[question, answer]`` entry of ``history``.

    Lines that are not valid JSON, or that do not decode to a non-empty,
    even-length list of strings, are reported on stdout and skipped instead
    of aborting the run or producing an empty sample.

    Args:
        input_folder: Root directory that is walked recursively.  Its base
            name is used as the prefix of every record ``id``.

    Returns:
        A list of dictionaries with the keys ``id``, ``instruction``,
        ``input`` (always empty), ``output`` and ``history``.
    """
    all_conversations = []
    folder_name = os.path.basename(os.path.normpath(input_folder))
    wanted_files = ("good.json", "reg_lines.json", "short.json")

    for root, _dirs, files in os.walk(input_folder):
        for file_name in files:
            if file_name not in wanted_files:
                continue
            input_file = os.path.join(root, file_name)
            with open(input_file, 'r', encoding='utf-8') as f_in:
                for line in f_in:
                    if not line.strip():
                        continue
                    try:
                        conversations = json.loads(line)
                    except json.JSONDecodeError:
                        print("Error: Invalid JSON format in file:", input_file)
                        continue
                    # Validate before touching the text so a malformed record
                    # never leaves a half-built sample behind.
                    if not _is_paired_dialogue(conversations):
                        print("Error: Unpaired or non-text conversation in file:", input_file)
                        continue

                    pairs = [
                        (_clean_alpaca_text(question, "问:"), _clean_alpaca_text(answer, "答:"))
                        for question, answer in zip(conversations[::2], conversations[1::2])
                    ]
                    *earlier, (question, answer) = pairs
                    all_conversations.append({
                        # Running index keeps ids unique within one call.
                        "id": f"{folder_name}_{len(all_conversations)}",
                        "instruction": question,
                        "input": "",
                        "output": answer,
                        "history": [list(pair) for pair in earlier],
                    })
    return all_conversations
def formatAlpaca(input_folder, output_file):
    """Write the Alpaca-format records for *input_folder* to *output_file*.

    The records come from ``convert_json_files``; their count is printed,
    then they are written as one indented JSON array in collection order
    (shuffling is intentionally not applied).

    Args:
        input_folder: Directory tree holding the source files.
        output_file: Destination path; its parent directory is created if
            it does not exist yet, as ``formatSharegpt`` already does.
    """
    all_conversations = convert_json_files(input_folder)
    print(len(all_conversations))

    # Without this, open() fails on a fresh checkout where the output
    # directory has not been created yet.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # One JSON array for the whole data set.  To emit JSON Lines instead,
    # dump each record on its own and write '\n' after it.
    with open(output_file, 'w', encoding='utf-8') as f_out:
        json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)
-
-
def formatFlatten(input_folder, output_file='./excel/huaxiang/huaxiang_2024.5.21.json'):
    """Flatten multi-turn dialogues into single question/answer pairs.

    Every ``.json`` file under *input_folder* is read line by line; each
    non-blank line is decoded as a JSON list that alternates question and
    answer strings.  Each pair is written to *output_file* as its own JSON
    array on its own line.  A question that does not mention "信义污水厂" gets
    "信义污水厂的" inserted after its first "问:" marker.

    Lines that are not valid JSON, or that do not decode to an even-length
    list of strings, are reported on stdout and skipped.

    Args:
        input_folder: Root directory that is walked recursively.
        output_file: Destination path.  It is only written when at least
            one pair was collected; its parent directory is created if
            needed.

    Returns:
        None.
    """
    all_conversations = []
    for root, _dirs, files in os.walk(input_folder):
        for file_name in files:
            if not file_name.endswith('.json'):
                continue
            input_file = os.path.join(root, file_name)
            with open(input_file, 'r', encoding='utf-8') as f_in:
                for line in f_in:
                    if not line.strip():
                        continue
                    try:
                        conversations = json.loads(line)
                    except json.JSONDecodeError:
                        print("Error: Invalid JSON format in file:", input_file)
                        continue
                    well_formed = (
                        isinstance(conversations, list)
                        and len(conversations) % 2 == 0
                        and all(isinstance(item, str) for item in conversations)
                    )
                    if not well_formed:
                        print("Error: Unpaired or non-text conversation in file:", input_file)
                        continue
                    for question, answer in zip(conversations[::2], conversations[1::2]):
                        if "信义污水厂" not in question:
                            # count=1: only the leading marker is qualified,
                            # not a later "问:" inside the question itself.
                            question = question.replace("问:", "问:信义污水厂的", 1)
                        all_conversations.append([question, answer])

    # Write one JSON array per line (JSON Lines).
    if all_conversations:
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f_out:
            for conversation_obj in all_conversations:
                json.dump(conversation_obj, f_out, ensure_ascii=False)
                f_out.write('\n')
-
if __name__ == "__main__":
    # Script entry point: build the Alpaca-format file for the "jiejin" data.
    input_folder, output_file = './book/jiejin/', './book/alpace/jiejin.json'
    formatAlpaca(input_folder=input_folder, output_file=output_file)
|