vor 10 Monaten · acedf0d4ef
--- a/__pycache__/random.cpython-38.pyc
+++ b/__pycache__/random.cpython-38.pyc
--- a/checkBad.py
+++ b/checkBad.py
@@ -0,0 +1,77 @@
 
				+import json

			
 
				+

			
 
				+def convert_json_files(input_file):

			
 
				+    filter_lines = []

			
 
				+    bad_lines = []

			
 
				+    all_wen = set()  # 使用集合存储已经出现的元素，以提高查找效率

			
 
				+    seen_items = []  # 用于记录重复出现的条目

			
 
				+    short_lines = []  # 存储比较短的集合

			
 
				+    

			
 
				+    with open(input_file, 'r', encoding='utf-8') as f_in:

			
 
				+        for item in f_in:

			
 
				+            # 解析JSON字符串

			
 
				+            try:

			
 
				+                json_obj = json.loads(item)

			
 
				+            except json.JSONDecodeError:

			
 
				+                # JSON解析错误，将其视为"坏行"

			
 
				+                bad_lines.append(item)

			
 
				+                continue

			
 
				+            

			
 
				+            # 获取JSON对象的第一个元素

			
 
				+            first_element = json_obj[0] if json_obj else None

			
 
				+            

			
 
				+            if first_element is None:

			
 
				+                # 如果JSON对象为空，将其视为"坏行"

			
 
				+                bad_lines.append(item)

			
 
				+                continue

			
 
				+            

			
 
				+            if "如图所示" in item or "详见图" in item or "[耿春女，高阳俊，李丹 主编]" in item :

			
 
				+                # 如果第一个元素包含"如图所示"，将其视为"坏行"

			
 
				+                bad_lines.append(item)

			
 
				+                continue

			
 
				+            

			
 
				+            if first_element in all_wen:

			
 
				+                # 如果第一个元素已经出现过，将其视为重复出现的条目

			
 
				+                seen_items.append(item)

			
 
				+                continue

			
 
				+            

			
 
				+            if len(item) < 100:

			
 
				+                # 如果集合长度小于100，将其视为比较短的集合

			
 
				+                short_lines.append(item)

			
 
				+                continue

			
 
				+            

			
 
				+            # 添加第一个元素到已经出现的元素集合中

			
 
				+            all_wen.add(first_element)

			
 
				+            filter_lines.append(item)

			
 
				+

			
 
				+    return filter_lines, bad_lines, seen_items, short_lines

			
 
				+

			
 
				+# 指定输入文件

			
 
				+input_file = '/Users/yushanghui/Desktop/比较接近水务的书籍/公众号/all_out.json'

			
 
				+# 指定输出文件

			
 
				+bad_file = '/Users/yushanghui/Desktop/比较接近水务的书籍/公众号/bad.json'

			
 
				+filter_file = '/Users/yushanghui/Desktop/比较接近水务的书籍/公众号/good.json'

			
 
				+loop_file = '/Users/yushanghui/Desktop/比较接近水务的书籍/公众号/loop.json'

			
 
				+short_file = '/Users/yushanghui/Desktop/比较接近水务的书籍/公众号/short.json'

			
 
				+# 调用函数进行转换

			
 
				+filter_lines, bad_lines, seen_items, short_lines = convert_json_files(input_file)

			
 
				+

			
 
				+# 写入 good.json

			
 
				+with open(filter_file, 'w', encoding='utf-8') as f_out:

			
 
				+    for line in filter_lines:

			
 
				+        f_out.write(line)

			
 
				+

			
 
				+# 写入 bad.json

			
 
				+with open(bad_file, 'w', encoding='utf-8') as f_out:

			
 
				+    for line in bad_lines:

			
 
				+        f_out.write(line)

			
 
				+

			
 
				+# 写入 loop.json

			
 
				+with open(loop_file, 'w', encoding='utf-8') as f_out:

			
 
				+    for line in seen_items:

			
 
				+        f_out.write(line)

			
 
				+

			
 
				+# 写入 short.json

			
 
				+with open(short_file, 'w', encoding='utf-8') as f_out:

			
 
				+    for line in short_lines:

			
 
				+        f_out.write(line)

			
--- a/excel.py
+++ b/excel.py
@@ -0,0 +1,105 @@
 
				+import pandas as pd

			
 
				+import json

			
 
				+import re

			
 
				+import random

			
 
				+import os

			
 
				+

			
 
				+def replace_numbers(text):

			
 
				+    pattern = r'(当前值\s*)(\d+(\.\d+)?)'  # 匹配"当前的值"后面的数字

			
 
				+

			
 
				+    def repl(match):

			
 
				+        num_str = match.group(2)

			
 
				+        num = float(num_str)

			
 
				+        

			
 
				+        if '.' in num_str:

			
 
				+            random_offset = round(random.uniform(-0.001, 0.05), 2)

			
 
				+        elif len(num_str) == 1:

			
 
				+            random_offset = round(random.uniform(-0.05, 0.05), 2)

			
 
				+        elif len(num_str) == 2:

			
 
				+            random_offset = round(random.uniform(-1.00, 1.00), 2)

			
 
				+        else:

			
 
				+            random_offset = round(random.uniform(-2, 2), 2)

			
 
				+        

			
 
				+        new_num = num + random_offset

			
 
				+        formatted_num = '{:.2f}'.format(new_num)

			
 
				+        return match.group(1) + formatted_num

			
 
				+

			
 
				+    new_text = re.sub(pattern, repl, text)

			
 
				+    return new_text

			
 
				+

			
 
				+def excel_to_json(excel_file_path):

			
 
				+    excel_data = pd.ExcelFile(excel_file_path)

			
 
				+    sheet_names = excel_data.sheet_names

			
 
				+    

			
 
				+    json_data = {}

			
 
				+    all_data = []

			
 
				+    

			
 
				+    for sheet_name in sheet_names:

			
 
				+        sheet_data = []

			
 
				+        df_sheet = pd.read_excel(excel_data, sheet_name=sheet_name)

			
 
				+        

			
 
				+        

			
 
				+        for _ in range(12):

			
 
				+            for _, row in df_sheet.iterrows():

			
 
				+                conversations = []

			
 
				+                system = ''

			
 
				+                check_value = ''

			
 
				+                for col_name, cell_value in row.items():

			
 
				+                    if pd.isna(cell_value):

			
 
				+                        continue

			
 
				+                    

			
 
				+                    if col_name == 'check':

			
 
				+                        check_value = eval(cell_value)

			
 
				+                    cell_value = re.sub(r'{{(?:export)}}', "", str(cell_value).strip()).strip()

			
 
				+                    if col_name == 'system':

			
 
				+                        system = replace_numbers(cell_value) if pd.notna(cell_value) else ''

			
 
				+                        system = system.format_map(check_value)

			
 
				+                        

			
 
				+                    elif col_name.startswith('问') and pd.notna(cell_value):

			
 
				+                        conversations.append({"from": "human", "value": cell_value})

			
 
				+                    elif col_name.startswith('答') and pd.notna(cell_value):

			
 
				+                        conversations.append({"from": "gpt", "value": cell_value})

			
 
				+                

			
 
				+                if conversations:

			
 
				+                    sheet_data.append({"system": system,"conversations": conversations})

			
 
				+                    all_data.append({"system": system,"conversations": conversations})

			
 
				+        

			
 
				+        json_data[sheet_name] = sheet_data

			
 
				+        print(len(all_data))

			
 
				+    return json_data ,all_data

			
 
				+

			
 
				+def save_json_per_sheet(json_data, output_folder):

			
 
				+    for sheet_name, sheet_data in json_data.items():

			
 
				+        output_file = os.path.join(output_folder, f'{sheet_name}.json')

			
 
				+        with open(output_file, 'w', encoding='utf-8') as f_out:

			
 
				+            json.dump(sheet_data, f_out, ensure_ascii=False, indent=4)

			
 
				+

			
 
				+

			
 
				+def save_json_all(all_data, output_file):

			
 
				+    with open(output_file, 'w', encoding='utf-8') as f_out:

			
 
				+        json.dump(all_data, f_out, ensure_ascii=False, indent=4)

			
 
				+    

			
 
# Path of the source Excel workbook
excel_file_path = '../excel/决策/决策5.0.xlsx'
# Folder that receives one JSON file per sheet
output_folder = '../excel/决策/json_per_sheet'


# Make sure the per-sheet output folder exists; create it if it does not
os.makedirs(output_folder, exist_ok=True)

# Convert the Excel workbook into JSON-ready data
json_data,all_data = excel_to_json(excel_file_path)

# Save each sheet's data to its own JSON file
save_json_per_sheet(json_data, output_folder)


# Combined JSON output file holding the records of all sheets.
# NOTE(review): only output_folder is created above; the '../excel/决策/out'
# directory is not created here, so it presumably has to exist already.
output_file = '../excel/决策/out/juece.json'
save_json_all(all_data, output_file)

			
 
				+

			
 
				+

			
 
				+

			
 
				+

			
 
				+

			
--- a/excelToJson.py
+++ b/excelToJson.py
@@ -0,0 +1,190 @@
 
				+# import xlrd

			
 
				+# import json

			
 
				+# import re

			
 
				+# data1 = xlrd.open_workbook("../excel/进水总氮限制答案合并.xlsx")

			
 
				+# sheet1 = data1.sheets()[0]

			
 
				+# rows = sheet1.nrows

			
 
				+# cols = sheet1.ncols

			
 
				+# header = sheet1.row_values(0)

			
 
				+

			
 
				+# callTables = []

			
 
				+

			
 
				+

			
 
				+# def mergefun(n,idx):

			
 
				+#     # 判断当前单元格是否属于合并单元格

			
 
				+#     is_merged = False

			
 
				+#     merged_start_row = merged_end_row = None

			
 
				+#     for rlo, rhi, clo, chi in sheet1.merged_cells:

			
 
				+#         if clo == n and rlo <= idx < rhi:

			
 
				+#             # print(rlo, rhi, clo, chi, n)

			
 
				+#             is_merged = True

			
 
				+#             merged_start_row = rlo

			
 
				+#             merged_end_row = rhi

			
 
				+#             break

			
 
				+#     return [is_merged,merged_start_row,merged_end_row]

			
 
				+

			
 
				+

			
 
				+# for n in range(cols):

			
 
				+#     if n > 1:

			
 
				+#         # 获取第 n 列的所有值

			
 
				+#         col_data = sheet1.col_values(n)

			
 
				+#         # 循环每一行

			
 
				+#         for idx, cell_value in enumerate(col_data):

			
 
				+#             # 剔除为空的值和第一行的值

			
 
				+#             if idx == 0 or cell_value == '':

			
 
				+#                 continue

			
 
				+#             # 获取当前单元格

			
 
				+#             cell = sheet1.cell(idx, n)

			
 
				+            

			
 
				+#             id_val = str(n) + '_' + str(idx + 1)  # 使用列号和行号组合作为唯一ID

			
 
				+#             mainType = "alert" if "alert" in cell_value else "export" if "export" in cell_value else "text"

			
 
				+            

			
 
				+#             options = []

			
 
				+#             twoDatas = []

			
 
				+#             threedDatas=[]

			
 
				+#             # 判断当前单元格是否属于合并单元格

			
 
				+#             is_merged = False

			
 
				+#             merged_start_row = merged_end_row = None

			
 
				+#             for rlo, rhi, clo, chi in sheet1.merged_cells:

			
 
				+#                 if clo == n and rlo <= idx+1 < rhi:

			
 
				+#                     is_merged = True

			
 
				+#                     merged_start_row = rlo

			
 
				+#                     merged_end_row = rhi

			
 
				+#                     break

			
 
				+#             # 处理合并单元格

			
 
				+#             if is_merged and "export" not in cell_value:

			
 
				+#                 # print(f"第{n}列 第{idx+ 1}行 是合并单元格，合并范围：{merged_start_row + 1}行到{merged_end_row}行")

			
 
				+#                 options_cells = sheet1.col_slice(n + 1, start_rowx=merged_start_row, end_rowx=merged_end_row)

			
 
				+#                 twoDatas.extend([cell_value.value for cell_value in options_cells])

			
 
				+#                 # print(twoDatas, n)

			
 
				+#                 for indTwo, item in enumerate(twoDatas):

			
 
				+#                     if not item.strip():

			
 
				+#                         continue

			
 
				+#                     itemTwo = {

			
 
				+#                         "option": re.sub(r'{{(?:export|alert)}}\n', "", item),

			
 
				+#                         "next": []

			
 
				+#                     }

			
 
				+#                     # 找到当前单元格的合并信息 

			
 
				+

			
 
				+#                     isTwo_merged, merged_start, merged_end = mergefun(n+1, indTwo + merged_start_row)

			
 
				+#                     if isTwo_merged:

			
 
				+#                         threed_cells = sheet1.col_slice(n + 2, start_rowx=merged_start, end_rowx=merged_end)

			
 
				+#                         threedDatas.clear()

			
 
				+#                         threedDatas.extend([newCell.value for newCell in threed_cells])

			
 
				+                        

			
 
				+#                         for indThree, itemThree in enumerate(threedDatas):

			
 
				+#                             if not itemThree.strip():

			
 
				+#                                 continue

			
 
				+#                             itemTwo['next'].append(str(n + 2) + '_' + str(indThree + merged_start + 1))

			
 
				+#                     else:

			
 
				+#                         itemTwo['next'].append(str(n + 2) + '_' + str(indTwo + merged_start_row + 1))

			
 
				+#                     options.append(itemTwo)

			
 
				+#             else:

			
 
				+#                 options= []

			
 
				+#             item = {

			
 
				+#                 "id": id_val,

			
 
				+#                 "mainType": mainType,

			
 
				+#                 "mainContent": re.sub(r'{{(?:export|alert)}}\n', "", cell_value),

			
 
				+#                 "options": options

			
 
				+#             }

			
 
				+#             callTables.append(item)

			
 
				+            

			
 
				+# # 将解析的数据保存为JSON文件

			
 
				+# with open("parsed_dataTwo.json", "w", encoding="utf-8") as json_file:

			
 
				+#     json.dump(callTables, json_file, ensure_ascii=False, indent=4)

			
 
				+

			
 
				+

			
 
				+

			
 
				+

			
 
				+import xlrd

			
 
				+import json

			
 
				+import re

			
 
				+import os

			
 
				+def check_merged(sheet, row, col):

			
 
				+    for rlo, rhi, clo, chi in sheet.merged_cells:

			
 
				+        if clo == col and rlo <= row < rhi:

			
 
				+            return True, rlo, rhi

			
 
				+    return False, None, None

			
 
				+

			
 
				+def parse_excel(sheet):

			
 
				+    cols = sheet.ncols

			
 
				+    call_tables = []

			
 
				+    system = ''

			
 
				+    boot=[]

			
 
				+    n=0

			
 
				+    for col in range(cols):

			
 
				+        if sheet.cell(0, col).value in ['rule', 'system']:

			
 
				+            system = sheet.cell(1, col).value

			
 
				+            n=col

			
 
				+            continue

			
 
				+        col_data = sheet.col_values(col)

			
 
				+        

			
 
				+        for idx, cell_value in enumerate(col_data):

			
 
				+            if idx == 0 or cell_value == '':

			
 
				+                continue

			
 
				+            id_val = f"{col}_{idx + 1}"

			
 
				+            if col== n+1:

			
 
				+               boot.append(id_val)

			
 
				+            main_type = "alert" if "alert" in cell_value else "export" if "export" in cell_value else "text"

			
 
				+            options = []

			
 
				+

			
 
				+            is_merged, merged_start_row, merged_end_row = check_merged(sheet, idx, col)

			
 
				+            

			
 
				+            if is_merged and "export" not in cell_value:

			
 
				+                options_cells = sheet.col_slice(col + 1, start_rowx=merged_start_row, end_rowx=merged_end_row)

			
 
				+                two_datas = [cell_value.value for cell_value in options_cells]

			
 
				+                

			
 
				+                for indTwo, item in enumerate(two_datas):

			
 
				+                    if not item.strip():

			
 
				+                        continue

			
 
				+                    

			
 
				+                    item_two = {

			
 
				+                        "option": re.sub(r'{{(?:export|alert)}}\n', "", item.strip()),

			
 
				+                        "next": []

			
 
				+                    }

			
 
				+                    

			
 
				+                    is_two_merged, merged_start, merged_end = check_merged(sheet, indTwo + merged_start_row, col + 1)

			
 
				+                    

			
 
				+                    if is_two_merged:

			
 
				+                        threed_cells = sheet.col_slice(col + 2, start_rowx=merged_start, end_rowx=merged_end)

			
 
				+                        threed_datas = [new_cell.value for new_cell in threed_cells]

			
 
				+                        

			
 
				+                        for ind_three, item_three in enumerate(threed_datas):

			
 
				+                            if not item_three.strip():

			
 
				+                                continue

			
 
				+                            item_two['next'].append(f"{col + 2}_{ind_three + merged_start + 1}")

			
 
				+                    else:

			
 
				+                        item_two['next'].append(f"{col + 2}_{indTwo + merged_start_row + 1}")

			
 
				+                    

			
 
				+                    options.append(item_two)

			
 
				+            

			
 
				+            item = {

			
 
				+                "id": id_val,

			
 
				+                "mainType": main_type,

			
 
				+                "mainContent": re.sub(r'{{(?:export|alert)}}', "", cell_value.strip()).strip(),

			
 
				+                "options": options

			
 
				+            }

			
 
				+            call_tables.append(item)

			
 
				+            

			
 
				+    result = {

			
 
				+         "norm": sheet.name,

			
 
				+         "system": system,

			
 
				+         "boot":boot,

			
 
				+         "questions": call_tables

			
 
				+        }

			
 
				+    return result

			
 
				+

			
 
				+def getAllSheets(file_path):

			
 
				+    data = xlrd.open_workbook(file_path)

			
 
				+    sheets = data.sheets()

			
 
				+    directory = "../excel/决策/json/"

			
 
				+    

			
 
				+    for item in sheets: 

			
 
				+        sheetName = item.name

			
 
				+        parsed_data = parse_excel(item)

			
 
				+        file_path = os.path.join(directory, str(sheetName+'.json'))

			
 
				+        with open (file_path, "w", encoding="utf-8") as json_file:

			
 
				+            json.dump(parsed_data, json_file, ensure_ascii=False, indent=4)

			
 
				+            

			
 
# Runs on import/execution of this module: export every sheet of the
# annotation template workbook to JSON.
getAllSheets("../excel/决策/决策标注模版4.0.xlsx")

			
 
				+

			
--- a/formatData.py
+++ b/formatData.py
@@ -0,0 +1,331 @@
 
				+#查找当前文件所有json文件 输出到一个文件夹每个会话对象换行不在一个集合里面

			
 
				+# import json

			
 
				+# import os

			
 
				+#

			
 
				+# def convert_json_files(input_folder):

			
 
				+#     all_conversations = []

			
 
				+#     for root, dirs, files in os.walk(input_folder):

			
 
				+#         for file_name in files:

			
 
				+#             if file_name.endswith('.json'):

			
 
				+#                 input_file = os.path.join(root, file_name)

			
 
				+#                 with open(input_file, 'r', encoding='utf-8') as f_in:

			
 
				+#                     for line in f_in:

			
 
				+#                         if  not line.strip():

			
 
				+#                             continue

			
 
				+#                         try:

			
 
				+#                             conversations = json.loads(line)

			
 
				+#                             conversation_obj = {"conversations": []}

			
 
				+#                             for i in range(0, len(conversations), 2):

			
 
				+#                                 if i == 0:

			
 
				+#                                     conversation_obj["conversations"].append({"role": "system", "content": "你是污水处理厂技术专家，针对用户提出的问题，提供专业见解和建议并且清晰有条理，解决用户提出的问题"})

			
 
				+#                                 conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问：")})

			
 
				+#                                 conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答：")})

			
 
				+#                             all_conversations.append(conversation_obj)

			
 
				+#                         except json.JSONDecodeError:

			
 
				+#                             print("Error: Invalid JSON format in file:", input_file)

			
 
				+#                             continue

			
 
				+#     return all_conversations

			
 
				+#

			
 
				+#  #指定输入文件夹

			
 
				+# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/optimizationData'

			
 
				+#

			
 
				+# all_conversations = convert_json_files(input_folder)

			
 
				+#

			
 
				+# # # 指定输出文件

			
 
				+# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/2024.4.09.json'

			
 
				+#

			
 
				+# # 将所有对话写入输出文件

			
 
				+# with open(output_file, 'w', encoding='utf-8') as f_out:

			
 
				+#     for conversation_obj in all_conversations:

			
 
				+#         json.dump(conversation_obj, f_out, ensure_ascii=False)

			
 
				+#         f_out.write('\n')

			
 
				+

			
 
				+

			
 
				+#这个可以查找文件夹里面所有的json文件并过滤

			
 
				+# import json

			
 
				+# import os

			
 
				+#

			
 
				+#

			
 
				+# def convert_json_files(input_folder):

			
 
				+#     all_conversations = []

			
 
				+#     special_lines = []

			
 
				+#     id = 0

			
 
				+#     for root, dirs, files in os.walk(input_folder):

			
 
				+#         for file_name in files:

			
 
				+#             if file_name.endswith('.json'):

			
 
				+#                 input_file = os.path.join(root, file_name)

			
 
				+#                 with open(input_file, 'r', encoding='utf-8') as f_in:

			
 
				+#                     for line in f_in:

			
 
				+#                         if "如图所示：" in line:

			
 
				+#                             special_lines.append(line)

			
 
				+#                             continue

			
 
				+#                         if not line.strip():

			
 
				+#                             continue

			
 
				+#                         try:

			
 
				+#                             conversations = json.loads(line)

			
 
				+#                             id += 1

			
 
				+#                             conversation_obj = {"id":"identity_" + str(id),"conversations": []}

			
 
				+#

			
 
				+#                             for i in range(0, len(conversations), 2):

			
 
				+#                                 # if i == 0:

			
 
				+#                                 #     conversation_obj["conversations"].append({"role": "system","content": ""})

			
 
				+#                                 conversation_obj["conversations"].append(

			
 
				+#                                     {"from": "user", "value": conversations[i].lstrip("问：")})

			
 
				+#                                 conversation_obj["conversations"].append(

			
 
				+#                                     {"from": "assistant", "value": conversations[i + 1].lstrip("答：")})

			
 
				+#                             all_conversations.append(conversation_obj)

			
 
				+#                         except json.JSONDecodeError:

			
 
				+#                             print("Error: Invalid JSON format in file:", input_file)

			
 
				+#                             continue

			
 
				+#

			
 
				+#     # Write special lines to a separate JSON file

			
 
				+#     if special_lines:

			
 
				+#         special_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/special_file.json'  # Define the path for the special file

			
 
				+#         with open(special_file, 'w', encoding='utf-8') as special_out:

			
 
				+#             for line in special_lines:

			
 
				+#                 special_out.write(line)

			
 
				+#

			
 
				+#     return all_conversations

			
 
				+#

			
 
				+#

			
 
				+# # 指定输入文件夹

			
 
				+# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/optimizationData'

			
 
				+#

			
 
				+# all_conversations = convert_json_files(input_folder)

			
 
				+#

			
 
				+# # 指定输出文件

			
 
				+# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/hongshanData_water.json'

			
 
				+#

			
 
				+# # 将所有对话写入输出文件

			
 
				+#

			
 
				+# with open(output_file, 'w', encoding='utf-8') as f_out:

			
 
				+#     json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)

			
 
				+

			
 
				+

			
 
				+# 输出到一个集合里面 并且格式化

			
 
				+# import json

			
 
				+# import os

			
 
				+

			
 
				+# def convert_json_files(input_folder):

			
 
				+#     all_conversations = []

			
 
				+#     for file_name in os.listdir(input_folder):

			
 
				+#         if file_name.endswith('.json'):

			
 
				+#             input_file = os.path.join(input_folder, file_name)

			
 
				+#             with open(input_file, 'r', encoding='utf-8') as f_in:

			
 
				+#                 for line in f_in:

			
 
				+#                     try:

			
 
				+#                         conversations = json.loads(line)

			
 
				+#                         conversation_obj = {"conversations": []}

			
 
				+#                         for i in range(0, len(conversations), 2):

			
 
				+#                             if i == 0:

			
 
				+#                                 conversation_obj["conversations"].append({"role": "system", "content": "假设你是一个污水处理厂技术专家，你要回答用户询问的问题."})

			
 
				+#                             conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问：")})

			
 
				+#                             conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答：")})

			
 
				+#                         all_conversations.append(conversation_obj)

			
 
				+#                     except json.JSONDecodeError:

			
 
				+#                         print("Error: Invalid JSON format in file:", input_file)

			
 
				+#                         continue

			
 
				+#     return all_conversations

			
 
				+

			
 
				+# # 指定输入文件夹

			
 
				+# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/jsondata/hebing'

			
 
				+

			
 
				+# all_conversations = convert_json_files(input_folder)

			
 
				+

			
 
				+# # 指定输出文件

			
 
				+# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/output.json'

			
 
				+

			
 
				+# # 将所有对话写入输出文件并格式化

			
 
				+# with open(output_file, 'w', encoding='utf-8') as f_out:

			
 
				+#     json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)

			
 
				+

			
 
				+

			
 
				+

			
 
				+# 输出的内容是数组并且每个对象会添加,以后换行

			
 
				+# import json

			
 
				+# import os

			
 
				+

			
 
				+# def convert_json_files(input_folder):

			
 
				+#     all_conversations = []

			
 
				+#     for file_name in os.listdir(input_folder):

			
 
				+#         if file_name.endswith('.json'):

			
 
				+#             input_file = os.path.join(input_folder, file_name)

			
 
				+#             with open(input_file, 'r', encoding='utf-8') as f_in:

			
 
				+#                 for line in f_in:

			
 
				+#                     try:

			
 
				+#                         conversations = json.loads(line)

			
 
				+#                         conversation_obj = {"conversations": []}

			
 
				+#                         for i in range(0, len(conversations), 2):

			
 
				+#                             if i == 0:

			
 
				+#                                 conversation_obj["conversations"].append({"role": "system", "content": "假设你是一个污水处理厂技术专家，你要回答用户询问的问题."})

			
 
				+#                             conversation_obj["conversations"].append({"role": "user", "content": conversations[i].lstrip("问：")})

			
 
				+#                             conversation_obj["conversations"].append({"role": "assistant", "content": conversations[i+1].lstrip("答：")})

			
 
				+#                         all_conversations.append(conversation_obj)

			
 
				+#                     except json.JSONDecodeError:

			
 
				+#                         print("Error: Invalid JSON format in file:", input_file)

			
 
				+#                         continue

			
 
				+#     return all_conversations

			
 
				+

			
 
				+# # 指定输入文件夹

			
 
				+# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/jsondata/hebing'

			
 
				+

			
 
				+# all_conversations = convert_json_files(input_folder)

			
 
				+

			
 
				+# # 指定输出文件

			
 
				+# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/output.json'

			
 
				+

			
 
				+# # 将所有对话写入输出文件并格式化

			
 
				+# with open(output_file, 'w', encoding='utf-8') as f_out:

			
 
				+#     f_out.write("[\n")  # 开始列表

			
 
				+#     for i, conversation_obj in enumerate(all_conversations):

			
 
				+#         json.dump(conversation_obj, f_out, ensure_ascii=False)

			
 
				+#         # f_out.write('\n')  # 每个对话对象后添加换行符

			
 
				+#         if i < len(all_conversations) - 1:

			
 
				+#             f_out.write(",\n")  # 在除了最后一个对话对象之后添加逗号和换行符

			
 
				+#     f_out.write("]\n")  # 结束列表

			
 
				+

			
 
				+

			
 
				+

			
 
				+

			
 
				+# qwen训练样本

			
 
				+# import json

			
 
				+# import os

			
 
				+

			
 
				+# def convert_json_files(input_folder):

			
 
				+#     all_conversations = []

			
 
				+#     special_lines = []

			
 
				+#     for root, dirs, files in os.walk(input_folder):

			
 
				+#         for file_name in files:

			
 
				+#             if file_name.endswith('.json'):

			
 
				+#                 input_file = os.path.join(root, file_name)

			
 
				+#                 with open(input_file, 'r', encoding='utf-8') as f_in:

			
 
				+#                     for line in f_in:

			
 
				+#                         if "如图所示：" in line:

			
 
				+#                             special_lines.append(line)

			
 
				+#                             continue

			
 
				+#                         if not line.strip():

			
 
				+#                             continue

			
 
				+#                         try:

			
 
				+#                             conversations = json.loads(line)

			
 
				+#                             conversation_obj = {"conversations": []}

			
 
				+#                             print(len(len(conversations)))

			
 
				+#                             for i in range(0, len(conversations), 2):

			
 
				+#                                 # if i == 0:

			
 
				+#                                 #     conversation_obj["conversations"].append({"role": "system","content": ""})

			
 
				+#                                 conversation_obj["conversations"].append(

			
 
				+#                                     {"role": "user", "content": conversations[i].lstrip("问：")})

			
 
				+#                                 conversation_obj["conversations"].append(

			
 
				+#                                     {"role": "assistant", "content": conversations[i + 1].lstrip("答：")})

			
 
				+#                             all_conversations.append(conversation_obj)

			
 
				+#                         except json.JSONDecodeError:

			
 
				+#                             print("Error: Invalid JSON format in file:", input_file)

			
 
				+#                             continue

			
 
				+

			
 
				+#     # Write special lines to a separate JSON file

			
 
				+#     if special_lines:

			
 
				+#         special_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/special_file.json'  # Define the path for the special file

			
 
				+#         with open(special_file, 'w', encoding='utf-8') as special_out:

			
 
				+#             for line in special_lines:

			
 
				+#                 special_out.write(line)

			
 
				+

			
 
				+#     return all_conversations

			
 
				+

			
 
				+

			
 
				+# # 指定输入文件夹

			
 
				+# input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/optimizationData'

			
 
				+

			
 
				+# all_conversations = convert_json_files(input_folder)

			
 
				+

			
 
				+# # 指定输出文件

			
 
				+# output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/hongshanData.json'

			
 
				+

			
 
				+# # 将所有对话写入输出文件

			
 
				+

			
 
				+# # with open(output_file, 'w', encoding='utf-8') as f_out:

			
 
				+# #     json.dump(all_conversation, f_out, ensure_ascii=False, indent=4)

			
 
				+

			
 
				+# with open(output_file, 'w', encoding='utf-8') as f_out:

			
 
				+#     for conversation_obj in all_conversations:

			
 
				+#         json.dump(conversation_obj, f_out, ensure_ascii=False)

			
 
				+#         f_out.write('\n')

			
 
				+

			
 
				+

			
 
				+

			
 
				+

			
 
				+# qwen训练样本   

			
 
				+import json

			
 
				+import os

			
 
				+import random

			
 
				+def convert_json_files(input_folder):

			
 
				+    all_conversations = []

			
 
				+    special_lines = []

			
 
				+    for root, dirs, files in os.walk(input_folder):

			
 
				+        for file_name in files:

			
 
				+            if file_name.endswith('.json'):

			
 
				+                input_file = os.path.join(root, file_name)

			
 
				+                with open(input_file, 'r', encoding='utf-8') as f_in:

			
 
				+                    for line in f_in:

			
 
				+                        if "如图所示：" in line:

			
 
				+                            special_lines.append(line)

			
 
				+                            continue

			
 
				+                        if not line.strip():

			
 
				+                            continue

			
 
				+                        try:

			
 
				+                            conversations = json.loads(line)

			
 
				+                            conversation_obj = {

			
 
				+                                "instruction": "",

			
 
				+                                "input": "",

			
 
				+                                "output": "",

			
 
				+                                "history": []

			
 
				+                            }

			
 
				+                            for i in range(0, len(conversations), 2):

			
 
				+                                # if i == 0:

			
 
				+                                #     conversation_obj["conversations"].append({"role": "system","content": ""})

			
 
				+                                if len(conversations)==2:

			
 
				+                                    conversation_obj["instruction"]=conversations[i].lstrip("问：")

			
 
				+                                    conversation_obj["output"]= conversations[i + 1].lstrip("答：")

			
 
				+                                elif len(conversations)>2:

			
 
				+                                    # todo 如果当前 i小于len(conversations)的最后两个执行下面的操作

			
 
				+                                    if i < len(conversations)-2:

			
 
				+                                        history = []

			
 
				+                                        history.append(conversations[i].lstrip("问："))

			
 
				+                                        history.append(conversations[i + 1].lstrip("答："))

			
 
				+                                        conversation_obj["history"].append(history)

			
 
				+                                    # todo 如果当前 i等于len(conversations)的最后两个元素执行下面的操作

			
 
				+                                    elif i == len(conversations) - 2:  # 倒数第二个对话

			
 
				+                                        conversation_obj["instruction"]=conversations[i].lstrip("问：")

			
 
				+                                        conversation_obj["output"]= conversations[i + 1].lstrip("答：")

			
 
				+                            all_conversations.append(conversation_obj)

			
 
				+                                        

			
 
				+                        except json.JSONDecodeError:

			
 
				+                            print("Error: Invalid JSON format in file:", input_file)

			
 
				+                            continue

			
 
				+

			
 
				+    # Write special lines to a separate JSON file

			
 
				+    if special_lines:

			
 
				+        special_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/special_file.json'  # Define the path for the special file

			
 
				+        with open(special_file, 'w', encoding='utf-8') as special_out:

			
 
				+            for line in special_lines:

			
 
				+                special_out.write(line)

			
 
				+

			
 
				+    return all_conversations

			
 
				+

			
 
				+

			
 
				+# 指定输入文件夹

			
 
				+input_folder = '/Users/yushanghui/hongshantianping/ai训练/data/孙浩prompt'

			
 
				+

			
 
				+all_conversations = convert_json_files(input_folder)

			
 
				+print(len(all_conversations))

			
 
				+# 指定输出文件

			
 
				+output_file = '/Users/yushanghui/hongshantianping/ai训练/data/lineJson/merged.json'

			
 
				+

			
 
				+# 将所有对话写入输出文件 随机打乱数据

			
 
				+random.shuffle(all_conversations)

			
 
				+

			
 
				+with open(output_file, 'w', encoding='utf-8') as f_out:

			
 
				+    json.dump(all_conversations, f_out, ensure_ascii=False, indent=4)

			
 
				+

			
 
				+# with open(output_file, 'w', encoding='utf-8') as f_out:

			
 
				+#     for conversation_obj in all_conversations:

			
 
				+#         json.dump(conversation_obj, f_out, ensure_ascii=False)

			
 
				+#         f_out.write('\n')

			
--- a/jiexiExcel.py
+++ b/jiexiExcel.py
@@ -0,0 +1,60 @@
 
				+import os
			
 
				+import xlrd
			
 
				+import json
			
 
				+
			
 
				+def parse_excel_files(directory):
			
 
				+    result = []
			
 
				+    # 遍历指定目录下的所有文件和文件夹
			
 
				+    for root, dirs, files in os.walk(directory):
			
 
				+        for file in files:
			
 
				+            # 仅处理扩展名为 .xlsx 或 .xls 的文件
			
 
				+            if file.endswith('.xlsx') or file.endswith('.xls'):
			
 
				+                file_path = os.path.join(root, file)
			
 
				+                
			
 
				+                # 使用 xlrd 打开 Excel 文件
			
 
				+                workbook = xlrd.open_workbook(file_path)
			
 
				+                # 遍历每个 sheet
			
 
				+                for sheet in workbook.sheets():
			
 
				+                    rows = sheet.nrows
			
 
				+                    for row in range(rows):
			
 
				+                        row_data = []
			
 
				+                        # 跳过表头
			
 
				+                        if row == 0:
			
 
				+                            continue
			
 
				+                        # 跳过空行
			
 
				+                        if all(sheet.cell_value(row, col) == '' for col in range(sheet.ncols)):
			
 
				+                            continue
			
 
				+                        # 遍历每一列
			
 
				+                        for col in range(sheet.ncols):
			
 
				+                            # 获取表头
			
 
				+                            header = sheet.cell_value(0, col)
			
 
				+                            # 获取单元格值
			
 
				+                            cell_value = sheet.cell_value(row, col)
			
 
				+                            if cell_value:
			
 
				+                                if header=='问':
			
 
				+                                    cell_value= str('问：'+cell_value)
			
 
				+                                    row_data.append(cell_value)
			
 
				+                                if header=='答':
			
 
				+                                    cell_value= str('答：'+cell_value)
			
 
				+                                    row_data.append(cell_value)
			
 
				+                        # 将有效行数据添加到结果中
			
 
				+                        if row_data:
			
 
				+                            result.append(row_data)
			
 
				+    return result
			
 
				+
			
 
				+def write_data_to_file(data, output_file):
			
 
				+    with open(output_file, 'w', encoding='utf-8') as f:
			
 
				+        # 遍历每行内容
			
 
				+        for row in data:
			
 
				+            # 使用json.dumps将行数据转换为JSON格式的字符串
			
 
				+            json_row = json.dumps(row, ensure_ascii=False)
			
 
				+            # 写入文件并添加换行符
			
 
				+            f.write(json_row + '\n')
			
 
				+
			
 
				+# 指定目录路径和输出文件路径
			
 
				+input_directory = '/Users/yushanghui/Desktop/比较接近水务的书籍/公众号'
			
 
				+output_file = '/Users/yushanghui/Desktop/比较接近水务的书籍/公众号/all_out.json'
			
 
				+
			
 
				+# 解析 Excel 文件并写入数据到文件
			
 
				+excel_data = parse_excel_files(input_directory)
			
 
				+write_data_to_file(excel_data, output_file)
			
--- a/merge.py
+++ b/merge.py
@@ -0,0 +1,28 @@
 
				+import os
			
 
				+import json
			
 
				+import random
			
 
				+
			
 
				+def merge_json_files(input_folder, output_file):
			
 
				+    all_data = []
			
 
				+
			
 
				+    # 遍历输入文件夹中的所有文件
			
 
				+    for filename in os.listdir(input_folder):
			
 
				+        if filename.endswith('.json'):
			
 
				+            file_path = os.path.join(input_folder, filename)
			
 
				+            with open(file_path, 'r', encoding='utf-8') as f:
			
 
				+                data = json.load(f)
			
 
				+                all_data.extend(data)
			
 
				+                
			
 
				+    # 随机化数据顺序
			
 
				+    random.shuffle(all_data)
			
 
				+
			
 
				+    # 将所有数据写入到输出文件中
			
 
				+    with open(output_file, 'w', encoding='utf-8') as f_out:
			
 
				+        json.dump(all_data, f_out, ensure_ascii=False, indent=2)
			
 
				+
			
 
				+# 设置输入文件夹和输出文件名
			
 
				+input_folder = '../excel/决策/out/'  # 输入文件夹路径
			
 
				+output_file = '../excel/决策/out/merged.json'  # 输出文件名
			
 
				+
			
 
				+# 调用函数合并 JSON 文件
			
 
				+merge_json_files(input_folder, output_file)