# ai-model / dataTools
							#查找当前文件所有json文件 输出到一个文件夹每个会话对象换行不在一个集合里面
# import json
# import os
#
# def convert_json_files(input_folder):
#     all_conversations = []
#     for root, dirs, files in os.walk(input_folder):
#         for file_name in files:
#             if file_name.endswith('.json'):
#                 input_file = os.path.join(root, file_name)
#                 with open(input_file, 'r', encoding='utf-8') as f_in:
#                     for line in f_in:
#                         if  not line.strip():
#                             continue
#                         try:
#                             conversations = json.loads(line)
#                             conversation_obj = {"conversations": []}
#                             for i in range(0, len(conversations), 2):
#                                 if i == 0:
#                                     conversation_obj["conversations"].append({"role": "system", "content": "你是污水处理厂技术专家，针对用户提出的问题，提供专业见解和建议并且清晰有条理，解决用户提出的问题"})
#                                 conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问：")})
#                                 conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答：")})
#                             all_conversations.append(conversation_obj)
#                         except json.JSONDecodeError:
#                             print("Error: Invalid JSON format in file:", input_file)
#                             continue
#     return all_conversations
#
#  #指定输入文件夹
# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/optimizationData'
#
# all_conversations = convert_json_files(input_folder)
#
# # # 指定输出文件
# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/2024.4.09.json'
#
# # 将所有对话写入输出文件
# with open(output_file, 'w', encoding='utf-8') as f_out:
#     for conversation_obj in all_conversations:
#         json.dump(conversation_obj, f_out, ensure_ascii=False)
#         f_out.write('\n')


#这个可以查找文件夹里面所有的json文件并过滤
# import json
# import os
#
#
# def convert_json_files(input_folder):
#     all_conversations = []
#     special_lines = []
#     id = 0
#     for root, dirs, files in os.walk(input_folder):
#         for file_name in files:
#             if file_name.endswith('.json'):
#                 input_file = os.path.join(root, file_name)
#                 with open(input_file, 'r', encoding='utf-8') as f_in:
#                     for line in f_in:
#                         if "如图所示：" in line:
#                             special_lines.append(line)
#                             continue
#                         if not line.strip():
#                             continue
#                         try:
#                             conversations = json.loads(line)
#                             id += 1
#                             conversation_obj = {"id":"identity_" + str(id),"conversations": []}
#
#                             for i in range(0, len(conversations), 2):
#                                 # if i == 0:
#                                 #     conversation_obj["conversations"].append({"role": "system","content": ""})
#                                 conversation_obj["conversations"].append(
#                                     {"from": "user", "value": conversations[i].lstrip("问：")})
#                                 conversation_obj["conversations"].append(
#                                     {"from": "assistant", "value": conversations[i + 1].lstrip("答：")})
#                             all_conversations.append(conversation_obj)
#                         except json.JSONDecodeError:
#                             print("Error: Invalid JSON format in file:", input_file)
#                             continue
#
#     # Write special lines to a separate JSON file
#     if special_lines:
#         special_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/special_file.json'  # Define the path for the special file
#         with open(special_file, 'w', encoding='utf-8') as special_out:
#             for line in special_lines:
#                 special_out.write(line)
#
#     return all_conversations
#
#
# # 指定输入文件夹
# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/optimizationData'
#
# all_conversations = convert_json_files(input_folder)
#
# # 指定输出文件
# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/hongshanData_water.json'
#
# # 将所有对话写入输出文件
#
# with open(output_file, 'w', encoding='utf-8') as f_out:
#     json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)


# 输出到一个集合里面 并且格式化
# import json
# import os

# def convert_json_files(input_folder):
#     all_conversations = []
#     for file_name in os.listdir(input_folder):
#         if file_name.endswith('.json'):
#             input_file = os.path.join(input_folder, file_name)
#             with open(input_file, 'r', encoding='utf-8') as f_in:
#                 for line in f_in:
#                     try:
#                         conversations = json.loads(line)
#                         conversation_obj = {"conversations": []}
#                         for i in range(0, len(conversations), 2):
#                             if i == 0:
#                                 conversation_obj["conversations"].append({"role": "system", "content": "假设你是一个污水处理厂技术专家，你要回答用户询问的问题."})
#                             conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问：")})
#                             conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答：")})
#                         all_conversations.append(conversation_obj)
#                     except json.JSONDecodeError:
#                         print("Error: Invalid JSON format in file:", input_file)
#                         continue
#     return all_conversations

# # 指定输入文件夹
# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/jsondata/hebing'

# all_conversations = convert_json_files(input_folder)

# # 指定输出文件
# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/output.json'

# # 将所有对话写入输出文件并格式化
# with open(output_file, 'w', encoding='utf-8') as f_out:
#     json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)


# 输出的内容是数组并且每个对象会添加,以后换行
# import json
# import os

# def convert_json_files(input_folder):
#     all_conversations = []
#     for file_name in os.listdir(input_folder):
#         if file_name.endswith('.json'):
#             input_file = os.path.join(input_folder, file_name)
#             with open(input_file, 'r', encoding='utf-8') as f_in:
#                 for line in f_in:
#                     try:
#                         conversations = json.loads(line)
#                         conversation_obj = {"conversations": []}
#                         for i in range(0, len(conversations), 2):
#                             if i == 0:
#                                 conversation_obj["conversations"].append({"role": "system", "content": "假设你是一个污水处理厂技术专家，你要回答用户询问的问题."})
#                             conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问：")})
#                             conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答：")})
#                         all_conversations.append(conversation_obj)
#                     except json.JSONDecodeError:
#                         print("Error: Invalid JSON format in file:", input_file)
#                         continue
#     return all_conversations

# # 指定输入文件夹
# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/jsondata/hebing'

# all_conversations = convert_json_files(input_folder)

# # 指定输出文件
# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/output.json'

# # 将所有对话写入输出文件并格式化
# with open(output_file, 'w', encoding='utf-8') as f_out:
#     f_out.write("[\n")  # 开始列表
#     for i, conversation_obj in enumerate(all_conversations):
#         json.dump(conversation_obj, f_out, ensure_ascii=False)
#         # f_out.write('\n')  # 每个对话对象后添加换行符
#         if i < len(all_conversations) - 1:
#             f_out.write(",\n")  # 在除了最后一个对话对象之后添加逗号和换行符
#     f_out.write("]\n")  # 结束列表


# qwen训练样本
import json
import os
import random
import re

# Replace image placeholders with Markdown image links
def replace_image_tags(text):
    """Rewrite ``@name.ext@$`` image placeholders as Markdown image links.

    A placeholder is ``@``, a file name, one of the extensions jpg/jpeg/png/gif
    (with or without a leading dot), then ``@$``. Whitespace that directly
    follows the placeholder is consumed by the match.

    Args:
        text: The string to scan.

    Returns:
        ``text`` with every placeholder replaced by
        ``![name](https://static.fuxicarbon.com/modelData/name.ext)``.
    """
    # The extension is mandatory here; both ".png" and "png" spellings match.
    tag_regex = re.compile(r'@([^@]+?)(\.(jpg|jpeg|png|gif)|jpg|jpeg|png|gif)@\$\s*')

    def to_markdown(found):
        stem, suffix = found.group(1), found.group(2)
        # Normalise the extension so that it always carries a leading dot.
        dotted = suffix if suffix.startswith('.') else '.' + suffix
        return f'![{stem}](https://static.fuxicarbon.com/modelData/{stem}{dotted})'

    return tag_regex.sub(to_markdown, text)


def convert_image_format(text):
    """Convert ``@name[.ext]@$`` image placeholders into Markdown image links.

    A placeholder is ``@``, a file name, an optional jpg/jpeg/png/gif
    extension (with or without a leading dot), then ``@`` followed by either
    a half-width ``$`` or a full-width ``＄``.

    Args:
        text: The string to scan.

    Returns:
        ``text`` with every placeholder replaced by
        ``![name](https://static.fuxicarbon.com/modelData/name.ext)``; the
        extension part is empty when the placeholder had none.
    """
    # A single pass is sufficient. The previous implementation ran a second,
    # narrower two-group pattern through the same callback. Everything that
    # pattern accepts is already matched by this one, so the second pass could
    # only ever match *across* an already converted image (e.g. in
    # "@x @a@$ y@$"), where the callback's group(3) lookup raised IndexError.
    pattern = r'@([^@]+?)(\.(jpg|jpeg|png|gif)|jpg|jpeg|png|gif)?@[\$\＄]'

    def replace_template(match):
        filename = match.group(1)
        # group(3) is the bare extension when a dot was written; group(2) is
        # the whole extension token; both are None when there is no extension.
        extension = match.group(3) or match.group(2) or ''
        if extension and not extension.startswith('.'):
            extension = '.' + extension
        return f'![{filename}](https://static.fuxicarbon.com/modelData/{filename}{extension})'

    return re.sub(pattern, replace_template, text)

def convert_json_files_sharegpt(
        input_folder,
        special_file='/Users/yushanghui/hongshantianping/ai训练/data/lineJson/special_file.json'):
    """Collect question/answer lines from ``*.json`` files as ShareGPT records.

    Every ``.json`` file below ``input_folder`` is read line by line. Each
    non-blank line is parsed as a JSON list of alternating question and answer
    strings and becomes ``{"conversations": [{"from": "human", ...},
    {"from": "gpt", ...}, ...]}``. A leading "问：" / "答：" marker is removed
    from each turn.

    Lines containing "如图所示：" are not converted; they are written verbatim
    to ``special_file`` instead. Lines that are not valid JSON, or that are not
    an even-length list of strings, are reported on stdout and skipped.

    Args:
        input_folder: Directory that is walked recursively.
        special_file: Destination for the set-aside lines. It is only written
            when at least one such line was found.

    Returns:
        A list of ShareGPT-style conversation dicts.
    """
    def strip_prefix(value, prefix):
        # str.lstrip(prefix) would strip any run of the prefix's *characters*,
        # turning "答：答案" into "案"; remove the exact prefix instead.
        return value[len(prefix):] if value.startswith(prefix) else value

    def is_dialogue(parsed):
        return (isinstance(parsed, list)
                and len(parsed) % 2 == 0
                and all(isinstance(turn, str) for turn in parsed))

    all_conversations = []
    special_lines = []
    for root, _dirs, files in os.walk(input_folder):
        for file_name in files:
            if not file_name.endswith('.json'):
                continue
            input_file = os.path.join(root, file_name)
            with open(input_file, 'r', encoding='utf-8') as f_in:
                for line in f_in:
                    if "如图所示：" in line:
                        special_lines.append(line)
                        continue
                    if not line.strip():
                        continue
                    try:
                        conversations = json.loads(line)
                    except json.JSONDecodeError:
                        print("Error: Invalid JSON format in file:", input_file)
                        continue
                    if not is_dialogue(conversations):
                        # An unpaired question or a non-list payload used to
                        # abort the whole run with an uncaught exception.
                        print("Error: Expected an even-length list of strings in file:", input_file)
                        continue

                    turns = []
                    for i in range(0, len(conversations), 2):
                        turns.append(
                            {"from": "human", "value": strip_prefix(conversations[i], "问：")})
                        turns.append(
                            {"from": "gpt", "value": strip_prefix(conversations[i + 1], "答：")})
                    all_conversations.append({"conversations": turns})

    # Write the set-aside lines to their own file.
    if special_lines:
        special_dir = os.path.dirname(special_file)
        if special_dir:
            os.makedirs(special_dir, exist_ok=True)
        with open(special_file, 'w', encoding='utf-8') as special_out:
            special_out.writelines(special_lines)

    return all_conversations


def formatSharegpt(input_folder='./book/2024.5.13/',
                   output_file='./book/shareGpt/2024.5.13/shareGpt.json'):
    """Convert a folder of question/answer files into one ShareGPT JSON file.

    The records produced by ``convert_json_files_sharegpt(input_folder)`` are
    written to ``output_file`` as a single indented JSON array. The output
    directory is created when it does not exist yet.

    Args:
        input_folder: Directory holding the source ``.json`` files.
        output_file: Path of the JSON file to write.
    """
    all_conversations = convert_json_files_sharegpt(input_folder)

    # Make sure the output directory exists; a bare file name has no directory
    # part and os.makedirs('') would raise.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Shuffling is intentionally disabled; enable it here if needed:
    # random.shuffle(all_conversations)

    with open(output_file, 'w', encoding='utf-8') as f_out:
        json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)


def convert_json_files(input_folder):
    """Collect question/answer lines as Alpaca-style records with history.

    Only files named ``good.json``, ``reg_lines.json`` or ``short.json`` below
    ``input_folder`` are read. Each non-blank line is parsed as a JSON list of
    alternating question and answer strings. The last pair becomes
    ``instruction`` / ``output``; all earlier pairs are stored in ``history``
    as ``[question, answer]`` lists. An empty list yields a record with empty
    fields.

    Each turn has its leading "问：" / "答：" marker removed, is stripped, has
    all spaces removed, and is passed through ``convert_image_format``.

    Lines that are not valid JSON, or that are not an even-length list of
    strings, are reported on stdout and skipped.

    Args:
        input_folder: Directory that is walked recursively. Its base name is
            used as the prefix of every record ``id``.

    Returns:
        A list of dicts with the keys ``id``, ``instruction``, ``input``,
        ``output`` and ``history``.
    """
    wanted_files = ("good.json", "reg_lines.json", "short.json")
    # ASCII space plus the non-breaking and ideographic variants. The original
    # chained two visually identical replace() calls; listing the characters
    # as escapes keeps them distinguishable.
    space_chars = (" ", "\u00a0", "\u3000")

    def clean_turn(value, prefix):
        value = value.strip()
        # str.lstrip(prefix) would strip any run of the prefix's *characters*
        # (e.g. "答：答案" -> "案"); remove the exact prefix instead.
        if value.startswith(prefix):
            value = value[len(prefix):]
        value = value.strip()
        for char in space_chars:
            value = value.replace(char, "")
        return convert_image_format(value)

    def is_dialogue(parsed):
        return (isinstance(parsed, list)
                and len(parsed) % 2 == 0
                and all(isinstance(turn, str) for turn in parsed))

    all_conversations = []
    # The folder name becomes the id prefix.
    folder_name = os.path.basename(os.path.normpath(input_folder))

    for root, _dirs, files in os.walk(input_folder):
        for file_name in files:
            if file_name not in wanted_files:
                continue
            input_file = os.path.join(root, file_name)
            with open(input_file, 'r', encoding='utf-8') as f_in:
                for line in f_in:
                    if not line.strip():
                        continue
                    try:
                        conversations = json.loads(line)
                    except json.JSONDecodeError:
                        print("Error: Invalid JSON format in file:", input_file)
                        continue
                    if not is_dialogue(conversations):
                        # An unpaired question or a non-list payload used to
                        # abort the whole run with an uncaught exception.
                        print("Error: Expected an even-length list of strings in file:", input_file)
                        continue

                    pairs = [
                        [clean_turn(conversations[i], "问："),
                         clean_turn(conversations[i + 1], "答：")]
                        for i in range(0, len(conversations), 2)
                    ]
                    conversation_obj = {
                        "id": f"{folder_name}_{len(all_conversations)}",
                        "instruction": "",
                        "input": "",
                        "output": "",
                        "history": [],
                    }
                    if pairs:
                        # Every pair but the last is context; the last pair is
                        # the sample's actual instruction and expected output.
                        conversation_obj["history"] = pairs[:-1]
                        conversation_obj["instruction"], conversation_obj["output"] = pairs[-1]
                    all_conversations.append(conversation_obj)
    return all_conversations

# Format the training data into the Alpaca format
def formatAlpaca(input_folder, output_file):
    """Convert a folder of question/answer files into one Alpaca JSON file.

    The records produced by ``convert_json_files(input_folder)`` are counted
    (the count is printed) and written to ``output_file`` as a single indented
    JSON array. The output directory is created when it does not exist yet.

    Args:
        input_folder: Directory holding the source ``.json`` files.
        output_file: Path of the JSON file to write.
    """
    all_conversations = convert_json_files(input_folder)
    print(len(all_conversations))

    # Create the target directory first, as formatSharegpt does; a bare file
    # name has no directory part and os.makedirs('') would raise.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Shuffling is intentionally disabled; enable it here if needed:
    # random.shuffle(all_conversations)
    with open(output_file, 'w', encoding='utf-8') as f_out:
        json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)
    
    
def formatFlatten(input_folder, output_file='./excel/huaxiang/huaxiang_2024.5.21.json'):
    """Flatten multi-turn dialogues into single question/answer JSON lines.

    Every ``.json`` file below ``input_folder`` is read line by line. Each
    non-blank line is parsed as a JSON list of alternating question and answer
    strings, and every pair is emitted as its own ``[question, answer]`` line.
    A question that does not mention "信义污水厂" gets "信义污水厂的" inserted
    right after its first "问：" marker.

    Lines that are not valid JSON, or that are not an even-length list of
    strings, are reported on stdout and skipped.

    Args:
        input_folder: Directory that is walked recursively.
        output_file: JSON Lines file to write. It is only written when at least
            one pair was collected; its directory is created if missing.

    Returns:
        None.
    """
    def is_dialogue(parsed):
        return (isinstance(parsed, list)
                and len(parsed) % 2 == 0
                and all(isinstance(turn, str) for turn in parsed))

    all_conversations = []
    for root, _dirs, files in os.walk(input_folder):
        for file_name in files:
            if not file_name.endswith('.json'):
                continue
            input_file = os.path.join(root, file_name)
            with open(input_file, 'r', encoding='utf-8') as f_in:
                for line in f_in:
                    if not line.strip():
                        continue
                    try:
                        conversations = json.loads(line)
                    except json.JSONDecodeError:
                        print("Error: Invalid JSON format in file:", input_file)
                        continue
                    if not is_dialogue(conversations):
                        # An unpaired question or a non-list payload used to
                        # abort the whole run with an uncaught exception.
                        print("Error: Expected an even-length list of strings in file:", input_file)
                        continue

                    for i in range(0, len(conversations), 2):
                        question = conversations[i]
                        if "信义污水厂" not in question:
                            # Only the first marker is rewritten, so a "问："
                            # quoted later in the question is left untouched.
                            question = question.replace("问：", "问：信义污水厂的", 1)
                        all_conversations.append([question, conversations[i + 1]])

    # Write one JSON array per line.
    if all_conversations:
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        with open(output_file, 'w', encoding='utf-8') as f_out:
            for conversation_obj in all_conversations:
                json.dump(conversation_obj, f_out, ensure_ascii=False)
                f_out.write('\n')
                    

if __name__ == "__main__":
    # Script entry point: build the Alpaca-format training file for the
    # "jiejin" data set.
    input_folder, output_file = './book/jiejin/', './book/alpace/jiejin.json'
    formatAlpaca(input_folder=input_folder, output_file=output_file)