123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126 |
- import json
- import os
- import re
def is_odd(number):
    """Return True when ``number`` leaves a remainder of exactly 1 modulo 2."""
    remainder = number % 2
    return remainder == 1
# Markup fragments (markdown table rules, LaTeX commands, long dashes) that
# send a line to the "reg_lines" bucket.
_MARKUP_KEYWORDS = (":--:", "\\frac", "\\mathrm", "\\text", ":---:", "----")

# Place names treated as sensitive words; compiled once instead of per line.
_SENSITIVE_PATTERN = re.compile(
    r"(金山)|(银川)|(福州)|(沈阳)|(台北)|(邯郸)|(德州)|(苏州)|(南京)"
)

# Raw lines (trailing newline included) shorter than this count as "short".
_MIN_LINE_LENGTH = 100


def _dedupe_key(first_element):
    """Return a hashable stand-in for a record's first element."""
    if isinstance(first_element, (list, dict)):
        # Containers are unhashable; use their canonical JSON text, tagged so
        # it can never collide with a plain string element.
        return (
            "__json__",
            json.dumps(first_element, ensure_ascii=False, sort_keys=True),
        )
    return first_element


def _classify_line(item, seen, noloop):
    """Return the result-bucket name for one raw input line.

    ``seen`` holds the dedupe keys of the lines accepted as "good" so far and
    is updated in place when this line is accepted.
    """
    try:
        record = json.loads(item)
    except json.JSONDecodeError:
        # Not parseable as JSON at all.
        return "bad"

    # Only non-empty JSON arrays whose first element is not null are usable;
    # dicts, numbers, strings, empty arrays, etc. are bad lines rather than
    # a crash further down.
    if not isinstance(record, list) or not record or record[0] is None:
        return "bad"

    # An odd element count means the content is not made of pairs.
    if len(record) % 2 == 1:
        return "odd"

    # Lines with table/LaTeX markup, or with more than one pair, are set aside.
    if any(keyword in item for keyword in _MARKUP_KEYWORDS) or len(record) > 2:
        return "reg_lines"

    if _SENSITIVE_PATTERN.search(item):
        return "mingan"

    # Figure references cannot stand alone as text. From here on the record
    # always has exactly two elements, so no further length check is needed.
    if ("如图所示" in item and len(item) < _MIN_LINE_LENGTH) or "详见图" in item:
        return "bad"

    key = _dedupe_key(record[0])
    if not noloop and key in seen:
        # The first element was already accepted earlier: a repeated entry.
        return "loop"

    if len(item) < _MIN_LINE_LENGTH:
        return "short"

    seen.add(key)
    return "good"


def convert_json_files(input_file, noloop=False):
    """Sort the lines of a JSON-lines file into quality buckets.

    Each line of ``input_file`` (read as UTF-8) is expected to hold one JSON
    array. Lines are tested in a fixed order and land in the first bucket
    whose rule matches:

    * ``bad``: not valid JSON, not a non-empty array, first element null, or
      the line refers to a figure ("如图所示" in a short line, or "详见图").
    * ``odd``: the array has an odd number of elements.
    * ``reg_lines``: the line contains table/LaTeX markup or the array has
      more than two elements.
    * ``mingan``: the line matches the sensitive-word pattern.
    * ``loop``: the first element equals that of an already accepted line
      (skipped when ``noloop`` is true).
    * ``short``: the raw line is shorter than 100 characters.
    * ``good``: everything else.

    Args:
        input_file: Path of the file to read.
        noloop: When true, repeated first elements are not filtered out.

    Returns:
        A dict mapping each bucket name to the list of raw lines (unchanged,
        trailing newline included) assigned to it, in input order.
    """
    result = {
        "bad": [],
        "good": [],
        "loop": [],
        "short": [],
        "odd": [],
        "reg_lines": [],
        "mingan": [],
    }
    seen = set()  # dedupe keys of lines accepted as "good"

    with open(input_file, 'r', encoding='utf-8') as f_in:
        for item in f_in:
            result[_classify_line(item, seen, noloop)].append(item)

    # Per-bucket summary, printed in the same order and wording as before.
    print(
        f"bad_lines数量{len(result['bad'])},\n"
        f"filter_lines{len(result['good'])},\n"
        f"loop_lines数量{len(result['loop'])},\n"
        f"short_lines数量{len(result['short'])},\n"
        f"odd_number数量{len(result['odd'])},\n"
        f"reg_lines数量{len(result['reg_lines'])},\n"
        f"mingan数量{len(result['mingan'])}\n"
    )
    return result
def write_files(out_folder, result_data, merge=None):
    """Write every entry of ``result_data`` to ``<out_folder>/<key>.json``.

    Args:
        out_folder: Destination directory; created (with parents) when it
            does not exist. An empty string means the current directory.
        result_data: Mapping of file stem to a list of values.
        merge: When left as ``None`` each list item is written verbatim, so
            the items are expected to be strings that carry their own line
            endings. Any other value writes the whole list as one indented
            JSON document.
    """
    # exist_ok avoids the check-then-create race; os.makedirs("") would raise,
    # so an empty folder name is simply left alone.
    if out_folder:
        os.makedirs(out_folder, exist_ok=True)

    for key, value in result_data.items():
        output_file_path = os.path.join(out_folder, f"{key}.json")
        with open(output_file_path, 'w', encoding='utf-8') as f_out:
            if merge is not None:
                json.dump(value, f_out, ensure_ascii=False, indent=2)
            else:
                f_out.writelines(value)
def beginCheckBadFun(input_file, out_file, noloop=False):
    """Classify the lines of ``input_file`` and write one file per bucket.

    ``out_file`` is passed to ``write_files`` as the output folder, and
    ``noloop`` is forwarded unchanged to ``convert_json_files``.
    """
    write_files(out_file, convert_json_files(input_file, noloop))
if __name__ == "__main__":
    import sys

    # Usage: script.py [input_file [output_folder]]
    # With no arguments the original hard-coded locations are used, so
    # existing invocations keep working on the author's machine.
    default_input = '/Users/yushanghui/hongshantianping/ai训练/data/shuiwuBook/json/diedaiedit6.20.json'
    default_output = '/Users/yushanghui/hongshantianping/ai训练/data/shuiwuBook/jsonOut/diedaiedit6.20'

    cli_args = sys.argv[1:]
    # Input file to classify.
    input_file = cli_args[0] if len(cli_args) >= 1 else default_input
    # Folder that receives one file per bucket.
    out_file = cli_args[1] if len(cli_args) >= 2 else default_output
    beginCheckBadFun(input_file, out_file)
|