# excel.py (3.8 KB)

  1. import pandas as pd
  2. import json
  3. import re
  4. import random
  5. import os
  6. def replace_numbers(text):
  7. pattern = r'(当前(?:的)?值(?:是|为)\s*)(\d+(\.\d+)?)' # 匹配"当前的值"后面的数字
  8. def repl(match):
  9. num_str = match.group(2)
  10. num = float(num_str)
  11. if '.' in num_str:
  12. random_offset = round(random.uniform(-0.001, 0.05), 2)
  13. elif len(num_str) == 1:
  14. random_offset = round(random.uniform(-0.05, 0.05), 2)
  15. elif len(num_str) == 2:
  16. random_offset = round(random.uniform(-1.00, 1.00), 2)
  17. else:
  18. random_offset = round(random.uniform(-2, 2), 2)
  19. new_num = num + random_offset
  20. formatted_num = '{:.2f}'.format(new_num)
  21. # 将随机化后的数字应用到匹配结果中
  22. return match.group(1) + formatted_num
  23. new_text = re.sub(pattern, repl, text)
  24. return new_text
  25. def excel_to_json(excel_file_path):
  26. excel_data = pd.ExcelFile(excel_file_path)
  27. sheet_names = excel_data.sheet_names
  28. json_data = {}
  29. all_data = []
  30. for sheet_name in sheet_names:
  31. sheet_data = []
  32. df_sheet = pd.read_excel(excel_data, sheet_name=sheet_name)
  33. for _ in range(12):
  34. for _, row in df_sheet.iterrows():
  35. conversations = []
  36. system = ''
  37. check_value = ''
  38. for col_name, cell_value in row.items():
  39. if pd.isna(cell_value):
  40. continue
  41. if col_name == 'check':
  42. check_value = eval(cell_value)
  43. cell_value = re.sub(r'{{(?:export)}}', "", str(cell_value).strip()).strip()
  44. if col_name == 'system':
  45. if pd.notna(cell_value):
  46. system = replace_numbers(cell_value).format_map(check_value)
  47. else:
  48. system = ''
  49. elif col_name.startswith('问') and pd.notna(cell_value):
  50. conversations.append({"from": "human", "value": cell_value})
  51. elif col_name.startswith('答') and pd.notna(cell_value):
  52. conversations.append({"from": "gpt", "value": cell_value})
  53. if conversations:
  54. sheet_data.append({"system": system,"conversations": conversations})
  55. all_data.append({"system": system,"conversations": conversations})
  56. json_data[sheet_name] = sheet_data
  57. print(len(all_data))
  58. return json_data ,all_data
  59. def save_json_per_sheet(json_data, output_folder):
  60. for sheet_name, sheet_data in json_data.items():
  61. output_file = os.path.join(output_folder, f'{sheet_name}.json')
  62. with open(output_file, 'w', encoding='utf-8') as f_out:
  63. json.dump(sheet_data, f_out, ensure_ascii=False, indent=4)
  64. def save_json_all(all_data, output_file):
  65. with open(output_file, 'w', encoding='utf-8') as f_out:
  66. json.dump(all_data, f_out, ensure_ascii=False, indent=4)
  67. # Excel 文件路径
  68. excel_file_path = './excel/决策标注模版4.0_demo.xlsx'
  69. # 输出 JSON 文件夹路径
  70. output_folder = './excel/demo'
  71. # 确保输出文件夹存在,如果不存在则创建
  72. os.makedirs(output_folder, exist_ok=True)
  73. # 将 Excel 转换为 JSON 数据
  74. json_data,all_data = excel_to_json(excel_file_path)
  75. # 将每个 sheet 的 JSON 数据保存为单独的 JSON 文件
  76. save_json_per_sheet(json_data, output_folder)
  77. # 输出 JSON 文件
  78. output_file = './public/jueceDemo.json'
  79. save_json_all(all_data, output_file)