余尚辉 vor 10 Monaten
Commit
acedf0d4ef
7 geänderte Dateien mit 791 neuen und 0 gelöschten Zeilen
  1. BIN
      __pycache__/random.cpython-38.pyc
  2. 77 0
      checkBad.py
  3. 105 0
      excel.py
  4. 190 0
      excelToJson.py
  5. 331 0
      formatData.py
  6. 60 0
      jiexiExcel.py
  7. 28 0
      merge.py

BIN
__pycache__/random.cpython-38.pyc


+ 77 - 0
checkBad.py

@@ -0,0 +1,77 @@
+import json
+
def convert_json_files(input_file):
    """Split a JSON-lines file into good / bad / duplicate / short lines.

    Each line of *input_file* is expected to be a JSON array whose first
    element is the question text.  Lines are classified, in this order:
      - bad:   unparsable JSON, non-list JSON, empty arrays, null first
               element, or lines referencing figures / a book edition;
      - loop:  lines whose first element was already accepted (duplicates);
      - short: raw lines shorter than 100 characters;
      - good:  everything else (first element recorded for deduplication).

    Returns (filter_lines, bad_lines, seen_items, short_lines).
    """
    filter_lines = []   # lines that pass every filter
    bad_lines = []      # malformed or figure-referencing lines
    all_wen = set()     # first elements already accepted (O(1) dedup lookup)
    seen_items = []     # duplicate lines (first element seen before)
    short_lines = []    # raw lines shorter than 100 characters

    with open(input_file, 'r', encoding='utf-8') as f_in:
        for item in f_in:
            try:
                json_obj = json.loads(item)
            except json.JSONDecodeError:
                # Unparsable line -> "bad"
                bad_lines.append(item)
                continue

            # A line must be a non-empty JSON array.  Anything else (dict,
            # number, string, empty list) previously crashed on json_obj[0]
            # with KeyError/TypeError/IndexError.
            if not isinstance(json_obj, list) or not json_obj:
                bad_lines.append(item)
                continue

            first_element = json_obj[0]
            if first_element is None:
                # A null question slot is unusable.
                bad_lines.append(item)
                continue

            # Lines referencing figures or this specific book edition are
            # unusable as standalone training text.
            if "如图所示" in item or "详见图" in item or "[耿春女,高阳俊,李丹 主编]" in item :
                bad_lines.append(item)
                continue

            if first_element in all_wen:
                # Question already seen -> duplicate bucket.
                seen_items.append(item)
                continue

            if len(item) < 100:
                # Whole raw line is too short to be a useful sample.
                short_lines.append(item)
                continue

            all_wen.add(first_element)
            filter_lines.append(item)

    return filter_lines, bad_lines, seen_items, short_lines
+
# Input: raw question/answer JSON-lines dump to be cleaned.
input_file = '/Users/yushanghui/Desktop/比较接近水务的书籍/公众号/all_out.json'
# Output targets for each category produced by convert_json_files().
bad_file = '/Users/yushanghui/Desktop/比较接近水务的书籍/公众号/bad.json'
filter_file = '/Users/yushanghui/Desktop/比较接近水务的书籍/公众号/good.json'
loop_file = '/Users/yushanghui/Desktop/比较接近水务的书籍/公众号/loop.json'
short_file = '/Users/yushanghui/Desktop/比较接近水务的书籍/公众号/short.json'

# Classify every line of the input file.
filter_lines, bad_lines, seen_items, short_lines = convert_json_files(input_file)

# Persist each category to its own file, in the same order as before:
# good, bad, loop, short.  Lines already carry their trailing newline.
for target_path, rows in (
    (filter_file, filter_lines),
    (bad_file, bad_lines),
    (loop_file, seen_items),
    (short_file, short_lines),
):
    with open(target_path, 'w', encoding='utf-8') as f_out:
        f_out.writelines(rows)

+ 105 - 0
excel.py

@@ -0,0 +1,105 @@
+import pandas as pd
+import json
+import re
+import random
+import os
+
def replace_numbers(text):
    """Randomly perturb every number following "当前值" in *text*.

    The jitter range scales with how the matched number is written
    (decimal, 1 digit, 2 digits, wider); the replacement is always
    rendered with exactly two decimal places.
    """
    pattern = r'(当前值\s*)(\d+(\.\d+)?)'

    def _jitter(digits):
        # Pick an offset range based on the textual width of the number.
        if '.' in digits:
            return round(random.uniform(-0.001, 0.05), 2)
        if len(digits) == 1:
            return round(random.uniform(-0.05, 0.05), 2)
        if len(digits) == 2:
            return round(random.uniform(-1.00, 1.00), 2)
        return round(random.uniform(-2, 2), 2)

    def _replace(match):
        digits = match.group(2)
        perturbed = float(digits) + _jitter(digits)
        return match.group(1) + '{:.2f}'.format(perturbed)

    return re.sub(pattern, _replace, text)
+
def excel_to_json(excel_file_path):
    """Convert every sheet of an Excel workbook into conversation records.

    Each data row yields {"system": ..., "conversations": [...]}; the whole
    sheet is processed 12 times so replace_numbers() produces 12 numeric
    variants of every row.  Returns (per-sheet dict, flat list of records).

    NOTE(review): this assumes the 'check' column appears BEFORE the
    'system' column in each sheet, because check_value is consumed by
    system.format_map() during the same left-to-right row pass — TODO
    confirm column order in the source workbook.
    """
    excel_data = pd.ExcelFile(excel_file_path)
    sheet_names = excel_data.sheet_names
    
    json_data = {}   # sheet name -> list of conversation records
    all_data = []    # flat list of records across all sheets
    
    for sheet_name in sheet_names:
        sheet_data = []
        df_sheet = pd.read_excel(excel_data, sheet_name=sheet_name)
        
        
        # 12 augmentation passes: each pass re-randomizes the numbers in
        # the system prompt via replace_numbers().
        for _ in range(12):
            for _, row in df_sheet.iterrows():
                conversations = []
                system = ''
                check_value = ''
                for col_name, cell_value in row.items():
                    if pd.isna(cell_value):
                        continue
                    
                    if col_name == 'check':
                        # SECURITY: eval() executes arbitrary spreadsheet
                        # content — only acceptable for trusted workbooks.
                        check_value = eval(cell_value)
                    # Strip {{export}} markers from the cell text.
                    cell_value = re.sub(r'{{(?:export)}}', "", str(cell_value).strip()).strip()
                    if col_name == 'system':
                        system = replace_numbers(cell_value) if pd.notna(cell_value) else ''
                        # Fill {placeholders} in the system prompt from the
                        # mapping parsed out of the 'check' column.
                        system = system.format_map(check_value)
                        
                    elif col_name.startswith('问') and pd.notna(cell_value):
                        # Question columns become human turns.
                        conversations.append({"from": "human", "value": cell_value})
                    elif col_name.startswith('答') and pd.notna(cell_value):
                        # Answer columns become gpt turns.
                        conversations.append({"from": "gpt", "value": cell_value})
                
                if conversations:
                    sheet_data.append({"system": system,"conversations": conversations})
                    all_data.append({"system": system,"conversations": conversations})
        
        json_data[sheet_name] = sheet_data
        print(len(all_data))
    return json_data ,all_data
+
def save_json_per_sheet(json_data, output_folder):
    """Dump each sheet's records to <output_folder>/<sheet name>.json."""
    for sheet_name in json_data:
        target = os.path.join(output_folder, f'{sheet_name}.json')
        with open(target, 'w', encoding='utf-8') as f_out:
            json.dump(json_data[sheet_name], f_out, ensure_ascii=False, indent=4)
+
+
def save_json_all(all_data, output_file):
    """Write the combined record list to *output_file* as pretty JSON."""
    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(all_data, sink, ensure_ascii=False, indent=4)
+    
# Source workbook and output locations for the 决策 dataset.
excel_file_path = '../excel/决策/决策5.0.xlsx'
output_folder = '../excel/决策/json_per_sheet'
output_file = '../excel/决策/out/juece.json'

# Make sure the per-sheet output folder exists before writing into it.
os.makedirs(output_folder, exist_ok=True)

# Convert the workbook, then persist both per-sheet and combined JSON.
json_data, all_data = excel_to_json(excel_file_path)
save_json_per_sheet(json_data, output_folder)
save_json_all(all_data, output_file)

+ 190 - 0
excelToJson.py

@@ -0,0 +1,190 @@
+# import xlrd
+# import json
+# import re
+# data1 = xlrd.open_workbook("../excel/进水总氮限制答案合并.xlsx")
+# sheet1 = data1.sheets()[0]
+# rows = sheet1.nrows
+# cols = sheet1.ncols
+# header = sheet1.row_values(0)
+
+# callTables = []
+
+
+# def mergefun(n,idx):
+#     # 判断当前单元格是否属于合并单元格
+#     is_merged = False
+#     merged_start_row = merged_end_row = None
+#     for rlo, rhi, clo, chi in sheet1.merged_cells:
+#         if clo == n and rlo <= idx < rhi:
+#             # print(rlo, rhi, clo, chi, n)
+#             is_merged = True
+#             merged_start_row = rlo
+#             merged_end_row = rhi
+#             break
+#     return [is_merged,merged_start_row,merged_end_row]
+
+
+# for n in range(cols):
+#     if n > 1:
+#         # 获取第 n 列的所有值
+#         col_data = sheet1.col_values(n)
+#         # 循环每一行
+#         for idx, cell_value in enumerate(col_data):
+#             # 剔除为空的值和第一行的值
+#             if idx == 0 or cell_value == '':
+#                 continue
+#             # 获取当前单元格
+#             cell = sheet1.cell(idx, n)
+            
+#             id_val = str(n) + '_' + str(idx + 1)  # 使用列号和行号组合作为唯一ID
+#             mainType = "alert" if "alert" in cell_value else "export" if "export" in cell_value else "text"
+            
+#             options = []
+#             twoDatas = []
+#             threedDatas=[]
+#             # 判断当前单元格是否属于合并单元格
+#             is_merged = False
+#             merged_start_row = merged_end_row = None
+#             for rlo, rhi, clo, chi in sheet1.merged_cells:
+#                 if clo == n and rlo <= idx+1 < rhi:
+#                     is_merged = True
+#                     merged_start_row = rlo
+#                     merged_end_row = rhi
+#                     break
+#             # 处理合并单元格
+#             if is_merged and "export" not in cell_value:
+#                 # print(f"第{n}列 第{idx+ 1}行 是合并单元格,合并范围:{merged_start_row + 1}行到{merged_end_row}行")
+#                 options_cells = sheet1.col_slice(n + 1, start_rowx=merged_start_row, end_rowx=merged_end_row)
+#                 twoDatas.extend([cell_value.value for cell_value in options_cells])
+#                 # print(twoDatas, n)
+#                 for indTwo, item in enumerate(twoDatas):
+#                     if not item.strip():
+#                         continue
+#                     itemTwo = {
+#                         "option": re.sub(r'{{(?:export|alert)}}\n', "", item),
+#                         "next": []
+#                     }
+#                     # 找到当前单元格的合并信息 
+
+#                     isTwo_merged, merged_start, merged_end = mergefun(n+1, indTwo + merged_start_row)
+#                     if isTwo_merged:
+#                         threed_cells = sheet1.col_slice(n + 2, start_rowx=merged_start, end_rowx=merged_end)
+#                         threedDatas.clear()
+#                         threedDatas.extend([newCell.value for newCell in threed_cells])
+                        
+#                         for indThree, itemThree in enumerate(threedDatas):
+#                             if not itemThree.strip():
+#                                 continue
+#                             itemTwo['next'].append(str(n + 2) + '_' + str(indThree + merged_start + 1))
+#                     else:
+#                         itemTwo['next'].append(str(n + 2) + '_' + str(indTwo + merged_start_row + 1))
+#                     options.append(itemTwo)
+#             else:
+#                 options= []
+#             item = {
+#                 "id": id_val,
+#                 "mainType": mainType,
+#                 "mainContent": re.sub(r'{{(?:export|alert)}}\n', "", cell_value),
+#                 "options": options
+#             }
+#             callTables.append(item)
+            
+# # 将解析的数据保存为JSON文件
+# with open("parsed_dataTwo.json", "w", encoding="utf-8") as json_file:
+#     json.dump(callTables, json_file, ensure_ascii=False, indent=4)
+
+
+
+
+import xlrd
+import json
+import re
+import os
def check_merged(sheet, row, col):
    """Return (is_merged, first_row, end_row) for cell (row, col).

    Scans the sheet's merged-cell ranges; end_row is exclusive, matching
    xlrd's merged_cells convention.  (False, None, None) when not merged.
    """
    for row_lo, row_hi, col_lo, col_hi in sheet.merged_cells:
        hit = col_lo == col and row_lo <= row < row_hi
        if hit:
            return True, row_lo, row_hi
    return False, None, None
+
def parse_excel(sheet):
    """Parse one decision-tree sheet into a question graph.

    Columns are conversation depth levels: a merged cell in column c spans
    its answer options in column c+1, and each option's "next" ids point
    into column c+2.  Node ids are "<col>_<row+1>" (row is 1-based here).

    Returns {"norm": sheet name, "system": system prompt, "boot": ids of
    the first question column, "questions": flat list of question nodes}.
    """
    cols = sheet.ncols
    call_tables = []   # flat list of every parsed question node
    system = ''        # system prompt taken from the 'rule'/'system' column
    boot=[]            # entry-point ids (first column after the system column)
    n=0                # index of the 'rule'/'system' column
    for col in range(cols):
        # The 'rule'/'system' column carries the prompt, not questions.
        if sheet.cell(0, col).value in ['rule', 'system']:
            system = sheet.cell(1, col).value
            n=col
            continue
        col_data = sheet.col_values(col)
        
        for idx, cell_value in enumerate(col_data):
            # Skip the header row and empty cells.
            if idx == 0 or cell_value == '':
                continue
            id_val = f"{col}_{idx + 1}"
            # Cells in the column right after the system column are the
            # conversation entry points.
            if col== n+1:
               boot.append(id_val)
            # {{alert}} / {{export}} markers embedded in the text select
            # the node type; plain cells are "text".
            main_type = "alert" if "alert" in cell_value else "export" if "export" in cell_value else "text"
            options = []

            is_merged, merged_start_row, merged_end_row = check_merged(sheet, idx, col)
            
            # A merged, non-terminal cell spans its options in column col+1.
            if is_merged and "export" not in cell_value:
                options_cells = sheet.col_slice(col + 1, start_rowx=merged_start_row, end_rowx=merged_end_row)
                two_datas = [cell_value.value for cell_value in options_cells]
                
                for indTwo, item in enumerate(two_datas):
                    if not item.strip():
                        continue
                    
                    item_two = {
                        "option": re.sub(r'{{(?:export|alert)}}\n', "", item.strip()),
                        "next": []
                    }
                    
                    # If the option itself is merged, it fans out to several
                    # follow-up questions two columns over; otherwise it has
                    # a single successor on its own row.
                    is_two_merged, merged_start, merged_end = check_merged(sheet, indTwo + merged_start_row, col + 1)
                    
                    if is_two_merged:
                        threed_cells = sheet.col_slice(col + 2, start_rowx=merged_start, end_rowx=merged_end)
                        threed_datas = [new_cell.value for new_cell in threed_cells]
                        
                        for ind_three, item_three in enumerate(threed_datas):
                            if not item_three.strip():
                                continue
                            item_two['next'].append(f"{col + 2}_{ind_three + merged_start + 1}")
                    else:
                        item_two['next'].append(f"{col + 2}_{indTwo + merged_start_row + 1}")
                    
                    options.append(item_two)
            
            item = {
                "id": id_val,
                "mainType": main_type,
                "mainContent": re.sub(r'{{(?:export|alert)}}', "", cell_value.strip()).strip(),
                "options": options
            }
            call_tables.append(item)
            
    result = {
         "norm": sheet.name,
         "system": system,
         "boot":boot,
         "questions": call_tables
        }
    return result
+
def getAllSheets(file_path, output_dir="../excel/决策/json/"):
    """Parse every sheet of the workbook at *file_path* to its own JSON file.

    Each sheet is run through parse_excel() and written to
    <output_dir>/<sheet name>.json.  The output directory is created if
    missing (the original crashed with FileNotFoundError otherwise).
    *output_dir* defaults to the previously hard-coded location, so existing
    callers are unaffected.
    """
    workbook = xlrd.open_workbook(file_path)
    os.makedirs(output_dir, exist_ok=True)

    for sheet in workbook.sheets():
        parsed_data = parse_excel(sheet)
        # Keep the workbook path distinct from the per-sheet output path
        # (the original shadowed its own file_path parameter here).
        out_path = os.path.join(output_dir, sheet.name + '.json')
        with open(out_path, "w", encoding="utf-8") as json_file:
            json.dump(parsed_data, json_file, ensure_ascii=False, indent=4)

getAllSheets("../excel/决策/决策标注模版4.0.xlsx")
+

+ 331 - 0
formatData.py

@@ -0,0 +1,331 @@
+#查找当前文件所有json文件 输出到一个文件夹每个会话对象换行不在一个集合里面
+# import json
+# import os
+#
+# def convert_json_files(input_folder):
+#     all_conversations = []
+#     for root, dirs, files in os.walk(input_folder):
+#         for file_name in files:
+#             if file_name.endswith('.json'):
+#                 input_file = os.path.join(root, file_name)
+#                 with open(input_file, 'r', encoding='utf-8') as f_in:
+#                     for line in f_in:
+#                         if  not line.strip():
+#                             continue
+#                         try:
+#                             conversations = json.loads(line)
+#                             conversation_obj = {"conversations": []}
+#                             for i in range(0, len(conversations), 2):
+#                                 if i == 0:
+#                                     conversation_obj["conversations"].append({"role": "system", "content": "你是污水处理厂技术专家,针对用户提出的问题,提供专业见解和建议并且清晰有条理,解决用户提出的问题"})
+#                                 conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问:")})
+#                                 conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答:")})
+#                             all_conversations.append(conversation_obj)
+#                         except json.JSONDecodeError:
+#                             print("Error: Invalid JSON format in file:", input_file)
+#                             continue
+#     return all_conversations
+#
+#  #指定输入文件夹
+# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/optimizationData'
+#
+# all_conversations = convert_json_files(input_folder)
+#
+# # # 指定输出文件
+# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/2024.4.09.json'
+#
+# # 将所有对话写入输出文件
+# with open(output_file, 'w', encoding='utf-8') as f_out:
+#     for conversation_obj in all_conversations:
+#         json.dump(conversation_obj, f_out, ensure_ascii=False)
+#         f_out.write('\n')
+
+
+#这个可以查找文件夹里面所有的json文件并过滤
+# import json
+# import os
+#
+#
+# def convert_json_files(input_folder):
+#     all_conversations = []
+#     special_lines = []
+#     id = 0
+#     for root, dirs, files in os.walk(input_folder):
+#         for file_name in files:
+#             if file_name.endswith('.json'):
+#                 input_file = os.path.join(root, file_name)
+#                 with open(input_file, 'r', encoding='utf-8') as f_in:
+#                     for line in f_in:
+#                         if "如图所示:" in line:
+#                             special_lines.append(line)
+#                             continue
+#                         if not line.strip():
+#                             continue
+#                         try:
+#                             conversations = json.loads(line)
+#                             id += 1
+#                             conversation_obj = {"id":"identity_" + str(id),"conversations": []}
+#
+#                             for i in range(0, len(conversations), 2):
+#                                 # if i == 0:
+#                                 #     conversation_obj["conversations"].append({"role": "system","content": ""})
+#                                 conversation_obj["conversations"].append(
+#                                     {"from": "user", "value": conversations[i].lstrip("问:")})
+#                                 conversation_obj["conversations"].append(
+#                                     {"from": "assistant", "value": conversations[i + 1].lstrip("答:")})
+#                             all_conversations.append(conversation_obj)
+#                         except json.JSONDecodeError:
+#                             print("Error: Invalid JSON format in file:", input_file)
+#                             continue
+#
+#     # Write special lines to a separate JSON file
+#     if special_lines:
+#         special_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/special_file.json'  # Define the path for the special file
+#         with open(special_file, 'w', encoding='utf-8') as special_out:
+#             for line in special_lines:
+#                 special_out.write(line)
+#
+#     return all_conversations
+#
+#
+# # 指定输入文件夹
+# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/optimizationData'
+#
+# all_conversations = convert_json_files(input_folder)
+#
+# # 指定输出文件
+# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/hongshanData_water.json'
+#
+# # 将所有对话写入输出文件
+#
+# with open(output_file, 'w', encoding='utf-8') as f_out:
+#     json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)
+
+
+# 输出到一个集合里面 并且格式化
+# import json
+# import os
+
+# def convert_json_files(input_folder):
+#     all_conversations = []
+#     for file_name in os.listdir(input_folder):
+#         if file_name.endswith('.json'):
+#             input_file = os.path.join(input_folder, file_name)
+#             with open(input_file, 'r', encoding='utf-8') as f_in:
+#                 for line in f_in:
+#                     try:
+#                         conversations = json.loads(line)
+#                         conversation_obj = {"conversations": []}
+#                         for i in range(0, len(conversations), 2):
+#                             if i == 0:
+#                                 conversation_obj["conversations"].append({"role": "system", "content": "假设你是一个污水处理厂技术专家,你要回答用户询问的问题."})
+#                             conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问:")})
+#                             conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答:")})
+#                         all_conversations.append(conversation_obj)
+#                     except json.JSONDecodeError:
+#                         print("Error: Invalid JSON format in file:", input_file)
+#                         continue
+#     return all_conversations
+
+# # 指定输入文件夹
+# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/jsondata/hebing'
+
+# all_conversations = convert_json_files(input_folder)
+
+# # 指定输出文件
+# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/output.json'
+
+# # 将所有对话写入输出文件并格式化
+# with open(output_file, 'w', encoding='utf-8') as f_out:
+#     json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)
+
+
+
+# 输出的内容是数组并且每个对象会添加,以后换行
+# import json
+# import os
+
+# def convert_json_files(input_folder):
+#     all_conversations = []
+#     for file_name in os.listdir(input_folder):
+#         if file_name.endswith('.json'):
+#             input_file = os.path.join(input_folder, file_name)
+#             with open(input_file, 'r', encoding='utf-8') as f_in:
+#                 for line in f_in:
+#                     try:
+#                         conversations = json.loads(line)
+#                         conversation_obj = {"conversations": []}
+#                         for i in range(0, len(conversations), 2):
+#                             if i == 0:
+#                                 conversation_obj["conversations"].append({"role": "system", "content": "假设你是一个污水处理厂技术专家,你要回答用户询问的问题."})
+#                             conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问:")})
+#                             conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答:")})
+#                         all_conversations.append(conversation_obj)
+#                     except json.JSONDecodeError:
+#                         print("Error: Invalid JSON format in file:", input_file)
+#                         continue
+#     return all_conversations
+
+# # 指定输入文件夹
+# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/jsondata/hebing'
+
+# all_conversations = convert_json_files(input_folder)
+
+# # 指定输出文件
+# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/output.json'
+
+# # 将所有对话写入输出文件并格式化
+# with open(output_file, 'w', encoding='utf-8') as f_out:
+#     f_out.write("[\n")  # 开始列表
+#     for i, conversation_obj in enumerate(all_conversations):
+#         json.dump(conversation_obj, f_out, ensure_ascii=False)
+#         # f_out.write('\n')  # 每个对话对象后添加换行符
+#         if i < len(all_conversations) - 1:
+#             f_out.write(",\n")  # 在除了最后一个对话对象之后添加逗号和换行符
+#     f_out.write("]\n")  # 结束列表
+
+
+
+
+# qwen训练样本
+# import json
+# import os
+
+# def convert_json_files(input_folder):
+#     all_conversations = []
+#     special_lines = []
+#     for root, dirs, files in os.walk(input_folder):
+#         for file_name in files:
+#             if file_name.endswith('.json'):
+#                 input_file = os.path.join(root, file_name)
+#                 with open(input_file, 'r', encoding='utf-8') as f_in:
+#                     for line in f_in:
+#                         if "如图所示:" in line:
+#                             special_lines.append(line)
+#                             continue
+#                         if not line.strip():
+#                             continue
+#                         try:
+#                             conversations = json.loads(line)
+#                             conversation_obj = {"conversations": []}
+#                             print(len(len(conversations)))
+#                             for i in range(0, len(conversations), 2):
+#                                 # if i == 0:
+#                                 #     conversation_obj["conversations"].append({"role": "system","content": ""})
+#                                 conversation_obj["conversations"].append(
+#                                     {"role": "user", "content": conversations[i].lstrip("问:")})
+#                                 conversation_obj["conversations"].append(
+#                                     {"role": "assistant", "content": conversations[i + 1].lstrip("答:")})
+#                             all_conversations.append(conversation_obj)
+#                         except json.JSONDecodeError:
+#                             print("Error: Invalid JSON format in file:", input_file)
+#                             continue
+
+#     # Write special lines to a separate JSON file
+#     if special_lines:
+#         special_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/special_file.json'  # Define the path for the special file
+#         with open(special_file, 'w', encoding='utf-8') as special_out:
+#             for line in special_lines:
+#                 special_out.write(line)
+
+#     return all_conversations
+
+
+# # 指定输入文件夹
+# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/optimizationData'
+
+# all_conversations = convert_json_files(input_folder)
+
+# # 指定输出文件
+# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/hongshanData.json'
+
+# # 将所有对话写入输出文件
+
+# # with open(output_file, 'w', encoding='utf-8') as f_out:
+# #     json.dump(all_conversation, f_out, ensure_ascii=False, indent=4)
+
+# with open(output_file, 'w', encoding='utf-8') as f_out:
+#     for conversation_obj in all_conversations:
+#         json.dump(conversation_obj, f_out, ensure_ascii=False)
+#         f_out.write('\n')
+
+
+
+
+# qwen训练样本   
+import json
+import os
+import random
def _strip_prefix(text, prefix):
    """Remove *prefix* from the start of *text* if present.

    str.lstrip("问:") strips any leading run of the characters 问 / :,
    which corrupted content that legitimately starts with those characters;
    this is a true prefix removal instead.
    """
    return text[len(prefix):] if text.startswith(prefix) else text


def convert_json_files(input_folder):
    """Collect Q/A JSON-lines under *input_folder* into alpaca-style records.

    Every .json file is read line by line; each line is a JSON array of
    alternating question/answer strings.  The final Q/A pair becomes
    instruction/output and earlier pairs go into "history".  Lines that
    reference figures ("如图所示:") are diverted to a special review file.

    Returns the list of {"instruction", "input", "output", "history"} dicts.
    """
    all_conversations = []
    special_lines = []
    for root, dirs, files in os.walk(input_folder):
        for file_name in files:
            if not file_name.endswith('.json'):
                continue
            input_file = os.path.join(root, file_name)
            with open(input_file, 'r', encoding='utf-8') as f_in:
                for line in f_in:
                    if "如图所示:" in line:
                        special_lines.append(line)
                        continue
                    if not line.strip():
                        continue
                    try:
                        conversations = json.loads(line)
                    except json.JSONDecodeError:
                        print("Error: Invalid JSON format in file:", input_file)
                        continue
                    conversation_obj = {
                        "instruction": "",
                        "input": "",
                        "output": "",
                        "history": []
                    }
                    for i in range(0, len(conversations), 2):
                        # Guard odd-length arrays: a trailing question with
                        # no answer previously raised an uncaught IndexError
                        # on conversations[i + 1].
                        if i + 1 >= len(conversations):
                            break
                        question = _strip_prefix(conversations[i], "问:")
                        answer = _strip_prefix(conversations[i + 1], "答:")
                        if i < len(conversations) - 2:
                            # Earlier turns become history pairs.
                            conversation_obj["history"].append([question, answer])
                        else:
                            # The last pair is the actual training sample.
                            conversation_obj["instruction"] = question
                            conversation_obj["output"] = answer
                    all_conversations.append(conversation_obj)

    # Figure-referencing lines are saved separately for manual review.
    if special_lines:
        special_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/special_file.json'  # Define the path for the special file
        with open(special_file, 'w', encoding='utf-8') as special_out:
            for line in special_lines:
                special_out.write(line)

    return all_conversations
+
+
# Folder of raw Q/A JSON-lines files to convert.
input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/孙浩prompt'

all_conversations = convert_json_files(input_folder)
print(len(all_conversations))

# Destination for the merged, shuffled training set.
output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/merged.json'

# Shuffle so training samples are not grouped by source file.
random.shuffle(all_conversations)

with open(output_file, 'w', encoding='utf-8') as f_out:
    json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)

+ 60 - 0
jiexiExcel.py

@@ -0,0 +1,60 @@
+import os
+import xlrd
+import json
+
def parse_excel_files(directory):
    """Walk *directory* for Excel files and collect Q/A rows.

    Every non-empty data row contributes a list like ["问:...", "答:..."]
    built from the columns whose headers are 问 (question) and 答 (answer);
    other columns are ignored.
    """
    rows_out = []
    for root, _dirs, files in os.walk(directory):
        # Only Excel workbooks are considered.
        for file in (f for f in files if f.endswith(('.xlsx', '.xls'))):
            workbook = xlrd.open_workbook(os.path.join(root, file))
            for sheet in workbook.sheets():
                headers = [sheet.cell_value(0, c) for c in range(sheet.ncols)]
                # Row 0 is the header row; start at row 1.
                for row in range(1, sheet.nrows):
                    values = [sheet.cell_value(row, c) for c in range(sheet.ncols)]
                    # Skip entirely empty rows.
                    if all(v == '' for v in values):
                        continue
                    row_data = []
                    for header, cell_value in zip(headers, values):
                        if not cell_value:
                            continue
                        if header == '问':
                            row_data.append('问:' + cell_value)
                        if header == '答':
                            row_data.append('答:' + cell_value)
                    if row_data:
                        rows_out.append(row_data)
    return rows_out
+
def write_data_to_file(data, output_file):
    """Write each row of *data* as one JSON array per line (JSON-lines)."""
    with open(output_file, 'w', encoding='utf-8') as sink:
        sink.writelines(json.dumps(row, ensure_ascii=False) + '\n' for row in data)
+
# Source folder with the Excel workbooks and destination JSON-lines file.
input_directory = '/Users/yushanghui/Desktop/比较接近水务的书籍/公众号'
output_file = '/Users/yushanghui/Desktop/比较接近水务的书籍/公众号/all_out.json'

# Parse every workbook, then dump the rows one JSON array per line.
excel_data = parse_excel_files(input_directory)
write_data_to_file(excel_data, output_file)

+ 28 - 0
merge.py

@@ -0,0 +1,28 @@
+import os
+import json
+import random
+
def merge_json_files(input_folder, output_file):
    """Merge every top-level .json array in *input_folder* into one file.

    The combined records are shuffled (uses the global random state) and
    written as a single pretty-printed JSON array to *output_file*.
    """
    merged = []
    # Gather records from every .json file directly under input_folder.
    for filename in os.listdir(input_folder):
        if not filename.endswith('.json'):
            continue
        with open(os.path.join(input_folder, filename), 'r', encoding='utf-8') as f:
            merged.extend(json.load(f))

    # Randomize sample order before writing.
    random.shuffle(merged)

    with open(output_file, 'w', encoding='utf-8') as f_out:
        json.dump(merged, f_out, ensure_ascii=False, indent=2)
+
# Merge all per-sheet JSON exports under ../excel/决策/out/ into merged.json.
input_folder = '../excel/决策/out/'
output_file = '../excel/决策/out/merged.json'

merge_json_files(input_folder, output_file)