123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109 |
- import pandas as pd
- import json
- import re
- import random
- import os
def replace_numbers(text):
    """Return *text* with each "current value" number randomly perturbed.

    A number is rewritten only when it directly follows a phrase matched by
    the pattern below (``当前值是`` / ``当前的值为`` and similar, with optional
    whitespace before the digits). The replacement is always formatted with
    two decimal places. Text without such a phrase is returned unchanged.

    The perturbation range depends on how the number was written:

    * contains a decimal point -> offset drawn from [-0.001, 0.05]
    * one digit                -> offset drawn from [-0.05, 0.05]
    * two digits               -> offset drawn from [-1.00, 1.00]
    * anything longer          -> offset drawn from [-2, 2]

    The offset is rounded to two decimals before it is added.
    """
    pattern = r'(当前(?:的)?值(?:是|为)\s*)(\d+(\.\d+)?)'

    def _jitter(match):
        # group(1) is the leading phrase, group(2) the number as written.
        prefix, digits = match.group(1), match.group(2)

        if '.' in digits:
            low, high = -0.001, 0.05
        elif len(digits) == 1:
            low, high = -0.05, 0.05
        elif len(digits) == 2:
            low, high = -1.00, 1.00
        else:
            low, high = -2, 2

        delta = round(random.uniform(low, high), 2)
        return prefix + '{:.2f}'.format(float(digits) + delta)

    return re.sub(pattern, _jitter, text)
def _row_to_record(row):
    """Turn one worksheet row into a ``{"system", "conversations"}`` record.

    Returns ``None`` when the row contains no question/answer cells.
    """
    # Resolve the placeholder mapping up front so the result does not depend
    # on whether the ``check`` column is laid out before or after ``system``.
    placeholders = {}
    raw_check = row.get('check')
    if raw_check is not None and not pd.isna(raw_check):
        # SECURITY: eval() executes whatever expression the workbook cell
        # contains. Only run this on trusted workbooks; if the cells are plain
        # dict literals, ast.literal_eval is the safe replacement.
        placeholders = eval(raw_check)

    system = ''
    conversations = []
    for col_name, cell_value in row.items():
        if pd.isna(cell_value):
            continue

        # Drop the ``{{export}}`` marker and surrounding whitespace.
        text = re.sub(r'{{(?:export)}}', "", str(cell_value).strip()).strip()
        # Headers are not guaranteed to be strings (e.g. numeric headers).
        label = str(col_name)

        if label == 'system':
            # Jitter the "current value" numbers, then fill in placeholders.
            system = replace_numbers(text).format_map(placeholders)
        elif label.startswith('问'):
            conversations.append({"from": "human", "value": text})
        elif label.startswith('答'):
            conversations.append({"from": "gpt", "value": text})

    if not conversations:
        return None
    return {"system": system, "conversations": conversations}


def excel_to_json(excel_file_path, repeats=12):
    """Convert every sheet of an Excel workbook into conversation records.

    Each row becomes one record: the ``system`` column is passed through
    ``replace_numbers`` and formatted with the mapping from the ``check``
    column; columns whose header starts with ``问`` become ``human`` turns
    and columns whose header starts with ``答`` become ``gpt`` turns, in
    column order. Rows without any turns are skipped.

    Args:
        excel_file_path: Path of the workbook to read.
        repeats: How many passes to make over each sheet. ``replace_numbers``
            is called again on every pass, so each pass yields records with
            freshly randomized numbers in ``system``. Defaults to 12.

    Returns:
        A tuple ``(json_data, all_data)`` where ``json_data`` maps each sheet
        name to its list of records and ``all_data`` is the concatenation of
        all sheets' records in sheet order.

    Side effects:
        Prints the total number of records.
    """
    json_data = {}
    all_data = []

    # The context manager closes the workbook handle even if a row fails.
    with pd.ExcelFile(excel_file_path) as excel_data:
        for sheet_name in excel_data.sheet_names:
            df_sheet = pd.read_excel(excel_data, sheet_name=sheet_name)

            sheet_data = []
            for _ in range(repeats):
                for _index, row in df_sheet.iterrows():
                    record = _row_to_record(row)
                    if record is not None:
                        sheet_data.append(record)

            json_data[sheet_name] = sheet_data
            all_data.extend(sheet_data)

    print(len(all_data))
    return json_data, all_data
def save_json_per_sheet(json_data, output_folder):
    """Write each sheet's records to ``<output_folder>/<sheet name>.json``.

    Args:
        json_data: Mapping of sheet name to a JSON-serializable value.
        output_folder: Folder that receives the files; it is not created here.

    Files are written as UTF-8 with non-ASCII characters kept verbatim and a
    4-space indent.
    """
    for sheet_name in json_data:
        target_path = os.path.join(output_folder, f'{sheet_name}.json')
        with open(target_path, 'w', encoding='utf-8') as handle:
            handle.write(
                json.dumps(json_data[sheet_name], ensure_ascii=False, indent=4)
            )
def save_json_all(all_data, output_file):
    """Write *all_data* to *output_file* as a single JSON document.

    The file is written as UTF-8 with non-ASCII characters kept verbatim and
    a 4-space indent. An existing file at that path is overwritten.
    """
    with open(output_file, 'w', encoding='utf-8') as handle:
        handle.write(json.dumps(all_data, ensure_ascii=False, indent=4))
-
# Path of the Excel workbook to convert.
excel_file_path = './excel/决策标注模版4.0_demo.xlsx'
# Folder that receives one JSON file per sheet.
output_folder = './excel/demo'
# Make sure the per-sheet output folder exists; create it if it does not.
os.makedirs(output_folder, exist_ok=True)
# Convert the workbook into per-sheet and combined record lists.
json_data, all_data = excel_to_json(excel_file_path)
# Save each sheet's records as a separate JSON file.
save_json_per_sheet(json_data, output_folder)
# Combined JSON file holding the records of every sheet.
output_file = './public/jueceDemo.json'
# The combined file lives in a different folder than the per-sheet files;
# create that folder too so the final write cannot fail after all the
# conversion work has already been done.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
save_json_all(all_data, output_file)
|