# checkBad.py - sort the lines of a JSON-lines file into good / bad / duplicate / short / odd / rule-matched / sensitive-word buckets.
  1. import json
  2. import os
  3. import re
  4. def is_odd(number):
  5. return number % 2 == 1
  6. def convert_json_files(input_file, noloop=False):
  7. filter_lines = []
  8. bad_lines = []
  9. all_wen = set() # 使用集合存储已经出现的元素,以提高查找效率
  10. loop_lines = [] # 用于记录重复出现的条目
  11. short_lines = [] # 存储比较短的集合
  12. odd_number = []
  13. reg_lines = [] # 根据过滤规则筛选
  14. mingan = [] # 敏感词
  15. # for root, dirs, files in os.walk(input_folder):
  16. # for file_name in files:
  17. # if file_name.endswith('.json'):
  18. # input_file = os.path.join(root, file_name)
  19. with open(input_file, 'r', encoding='utf-8') as f_in:
  20. for item in f_in:
  21. # 解析JSON字符串
  22. try:
  23. json_obj = json.loads(item)
  24. except json.JSONDecodeError:
  25. # JSON解析错误,将其视为"坏行"
  26. bad_lines.append(item)
  27. continue
  28. # 获取JSON对象的第一个元素
  29. first_element = json_obj[0] if json_obj else None
  30. if first_element is None:
  31. # 如果JSON对象为空,将其视为"坏行"
  32. bad_lines.append(item)
  33. continue
  34. # 判断奇数 不是成对的内容
  35. if is_odd(len(json_obj)):
  36. odd_number.append(item)
  37. continue
  38. # 正则规则匹配 包含 :--: 或者 \\frac 或者\\mathrm 或者\\text ---- 或者:---:
  39. keywords = [":--:", "\\frac", "\\mathrm", "\\text", ":---:", "----"]
  40. if any(keyword in item for keyword in keywords):
  41. reg_lines.append(item)
  42. continue
  43. if len(json_obj) > 2:
  44. reg_lines.append(item)
  45. continue
  46. pattern = r"(金山)|(银川)|(福州)|(沈阳)|(台北)|(邯郸)|(德州)|(苏州)|(南京)"
  47. if re.search(pattern, item):
  48. mingan.append(item)
  49. continue
  50. if "如图所示" in item and len(item) < 100 or "详见图" in item or len(json_obj) < 2:
  51. # 如果第一个元素包含"如图所示",将其视为"坏行"
  52. bad_lines.append(item)
  53. continue
  54. if first_element in all_wen and noloop == False:
  55. # 如果第一个元素已经出现过,将其视为重复出现的条目
  56. loop_lines.append(item)
  57. continue
  58. if len(item) < 100:
  59. # 如果集合长度小于100,将其视为比较短的集合
  60. short_lines.append(item)
  61. continue
  62. # 添加第一个元素到已经出现的元素集合中
  63. all_wen.add(first_element)
  64. filter_lines.append(item)
  65. result = {
  66. "bad": bad_lines,
  67. "good": filter_lines,
  68. "loop": loop_lines,
  69. "short": short_lines,
  70. 'odd': odd_number,
  71. 'reg_lines': reg_lines,
  72. "mingan": mingan
  73. }
  74. print(f'''bad_lines数量{len(bad_lines)},
  75. filter_lines{len(filter_lines)},
  76. loop_lines数量{len(loop_lines)},
  77. short_lines数量{len(short_lines)},
  78. odd_number数量{len(odd_number)},
  79. reg_lines数量{len(reg_lines)},
  80. mingan数量{len(mingan)}
  81. ''')
  82. return result
  83. def write_files(out_folder, result_data, merge=None):
  84. # 确保输出文件夹存在,如果不存在则创建
  85. if not os.path.exists(out_folder):
  86. os.makedirs(out_folder)
  87. # 循环遍历result_data对象
  88. for key, value in result_data.items():
  89. # 构建输出文件路径
  90. output_file_path = os.path.join(out_folder, f"{key}.json")
  91. # 将值作为JSON格式写入对应的文件中
  92. with open(output_file_path, 'w', encoding='utf-8') as f_out:
  93. if merge is not None:
  94. f_out.write(json.dumps(value, ensure_ascii=False, indent=2))
  95. else:
  96. for line in value:
  97. f_out.write(line)
  98. def beginCheckBadFun(input_file, out_file, noloop=False):
  99. # 调用函数进行转换
  100. result_data = convert_json_files(input_file, noloop)
  101. write_files(out_file, result_data)
  102. if __name__ == "__main__":
  103. # 指定输入文件
  104. input_file = '/Users/yushanghui/hongshantianping/ai训练/data/shuiwuBook/json/diedaiedit6.20.json'
  105. # 指定输出文件
  106. out_file = '/Users/yushanghui/hongshantianping/ai训练/data/shuiwuBook/jsonOut/diedaiedit6.20'
  107. beginCheckBadFun(input_file, out_file)