import json
import os
import re

# Substrings that mark a record as containing Markdown-table or LaTeX markup.
_MARKUP_KEYWORDS = (":--:", "\\frac", "\\mathrm", "\\text", ":---:", "----")

# Place names that send a record to the "mingan" (sensitive word) bucket.
# Compiled once here instead of on every input line.
_SENSITIVE_PATTERN = re.compile(
    r"(金山)|(银川)|(福州)|(沈阳)|(台北)|(邯郸)|(德州)|(苏州)|(南京)"
)

# Raw lines shorter than this many characters are considered too short.
_MIN_LINE_LENGTH = 100


def is_odd(number):
    """Return True when *number* is odd."""
    return number % 2 == 1


def _dedupe_key(first_element):
    """Return a hashable stand-in for *first_element* for duplicate tracking.

    Decoded JSON lists and dicts are unhashable and cannot be stored in a set,
    so they are replaced by a tuple wrapping their canonical JSON text.  A
    tuple can never be equal to a decoded JSON scalar, so it cannot collide
    with a record whose first element is a plain string or number.
    """
    if isinstance(first_element, (list, dict)):
        return ("json", json.dumps(first_element, ensure_ascii=False, sort_keys=True))
    return first_element


def convert_json_files(input_file, noloop=False):
    """Sort the lines of a JSON-lines file into quality buckets.

    Every line of *input_file* is parsed as JSON and put into exactly one
    bucket; the first matching rule wins, in this order:

    1. ``bad``       - not valid JSON, an empty value, a value that is not an
                       array (or string), or a first element of ``null``.
    2. ``odd``       - an odd number of elements (not made of pairs).
    3. ``reg_lines`` - the raw line contains table/LaTeX markup, or the
                       record has more than two elements.
    4. ``mingan``    - the raw line mentions one of the listed place names.
    5. ``bad``       - a line shorter than 100 characters containing
                       "如图所示", or any line containing "详见图".
    6. ``loop``      - the first element was already seen in an accepted
                       record (skipped when *noloop* is true).
    7. ``short``     - the raw line is shorter than 100 characters.
    8. ``good``      - everything else.

    The text checks run against the raw line, not the decoded values, so
    text stored as ``\\uXXXX`` escapes is not matched by them.

    Args:
        input_file: Path of a UTF-8 file holding one JSON value per line.
        noloop: When true, duplicate first elements are not filtered out.

    Returns:
        A dict mapping the bucket names ``bad``, ``good``, ``loop``,
        ``short``, ``odd``, ``reg_lines`` and ``mingan`` to lists of the raw
        input lines (line terminators included) in input order.

    Raises:
        OSError: If *input_file* cannot be opened or read.
    """
    filter_lines = []
    bad_lines = []
    loop_lines = []   # records whose first element was already accepted
    short_lines = []  # records whose raw line is too short
    odd_number = []
    reg_lines = []    # records rejected by the markup / length rules
    mingan = []       # records containing sensitive words
    seen_first = set()  # a set keeps the duplicate lookup O(1)

    with open(input_file, 'r', encoding='utf-8') as f_in:
        for item in f_in:
            try:
                json_obj = json.loads(item)
            except json.JSONDecodeError:
                bad_lines.append(item)
                continue

            # Only arrays (and strings) can be indexed by position.  Objects,
            # numbers and booleans used to crash on ``json_obj[0]``; they are
            # now rejected together with empty values.
            if not json_obj or not isinstance(json_obj, (list, str)):
                bad_lines.append(item)
                continue

            first_element = json_obj[0]
            if first_element is None:
                bad_lines.append(item)
                continue

            # An odd element count means the content is not made of pairs.
            if is_odd(len(json_obj)):
                odd_number.append(item)
                continue

            if any(keyword in item for keyword in _MARKUP_KEYWORDS):
                reg_lines.append(item)
                continue

            if len(json_obj) > 2:
                reg_lines.append(item)
                continue

            if _SENSITIVE_PATTERN.search(item):
                mingan.append(item)
                continue

            # From here on the record has exactly two elements, so no further
            # length check on json_obj is needed.
            if ("如图所示" in item and len(item) < _MIN_LINE_LENGTH) or "详见图" in item:
                bad_lines.append(item)
                continue

            key = _dedupe_key(first_element)
            if not noloop and key in seen_first:
                loop_lines.append(item)
                continue

            if len(item) < _MIN_LINE_LENGTH:
                short_lines.append(item)
                continue

            # Only accepted records take part in duplicate detection.
            seen_first.add(key)
            filter_lines.append(item)

    result = {
        "bad": bad_lines,
        "good": filter_lines,
        "loop": loop_lines,
        "short": short_lines,
        'odd': odd_number,
        'reg_lines': reg_lines,
        "mingan": mingan,
    }
    print(
        f"bad_lines数量{len(bad_lines)}, filter_lines{len(filter_lines)}, "
        f"loop_lines数量{len(loop_lines)}, short_lines数量{len(short_lines)}, "
        f"odd_number数量{len(odd_number)}, \n"
        f"reg_lines数量{len(reg_lines)}, mingan数量{len(mingan)} "
    )
    return result


def write_files(out_folder, result_data, merge=None):
    """Write each bucket of *result_data* to ``<out_folder>/<key>.json``.

    Args:
        out_folder: Directory to write into; created if it does not exist.
        result_data: Mapping of bucket name to a list of raw lines, as
            returned by ``convert_json_files``.
        merge: When not ``None``, each bucket is written as one indented JSON
            array.  Otherwise the lines are written back unchanged, so the
            output keeps the one-record-per-line layout of the input.

    Raises:
        OSError: If the directory or one of the files cannot be written.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(out_folder, exist_ok=True)

    for key, value in result_data.items():
        output_file_path = os.path.join(out_folder, f"{key}.json")
        with open(output_file_path, 'w', encoding='utf-8') as f_out:
            if merge is not None:
                f_out.write(json.dumps(value, ensure_ascii=False, indent=2))
            else:
                # The lines still carry their own terminators.
                f_out.writelines(value)


def beginCheckBadFun(input_file, out_file, noloop=False):
    """Classify *input_file* and write one file per bucket.

    Args:
        input_file: Path of the JSON-lines file to classify.
        out_file: Output directory (despite the name, this is a folder).
        noloop: Passed through to ``convert_json_files``.
    """
    result_data = convert_json_files(input_file, noloop)
    write_files(out_file, result_data)


if __name__ == "__main__":
    # Input file to classify.
    input_file = '/Users/yushanghui/hongshantianping/ai训练/data/shuiwuBook/json/diedaiedit6.20.json'
    # Output directory for the per-bucket files.
    out_file = '/Users/yushanghui/hongshantianping/ai训练/data/shuiwuBook/jsonOut/diedaiedit6.20'
    beginCheckBadFun(input_file, out_file)