# ai-model / dataTools
							import json
import os
import re


def is_odd(number):
    """Return True when ``number`` leaves a remainder of 1 after division by 2."""
    remainder = number % 2
    return remainder == 1


def convert_json_files(input_file, noloop=False):
    """Sort the lines of a JSON-lines file into quality categories.

    Every line of ``input_file`` is parsed as JSON and placed, as raw text,
    into exactly one category. The checks run in this order and the first
    match wins:

    * ``bad``: the line is not valid JSON, is not a JSON array, is an empty
      array, or its first entry is ``null``.
    * ``odd``: the array has an odd number of entries.
    * ``reg_lines``: the raw line contains one of the table/LaTeX markers, or
      the array has more than two entries.
    * ``mingan``: the raw line matches one of the sensitive place names.
    * ``bad``: the raw line contains "如图所示" and is shorter than 100
      characters, or contains "详见图".
    * ``loop``: the first entry was already seen on an accepted line
      (skipped when ``noloop`` is true).
    * ``short``: the raw line is shorter than 100 characters.
    * ``good``: everything else; the first entry is remembered for the
      duplicate check.

    A summary of the category sizes is printed before returning.

    Args:
        input_file: Path of the UTF-8 encoded file to read, one JSON value
            per line.
        noloop: When true, the duplicate-first-entry check is disabled.

    Returns:
        A dict with the keys ``bad``, ``good``, ``loop``, ``short``, ``odd``,
        ``reg_lines`` and ``mingan``, each mapping to a list of raw lines in
        input order.

    Raises:
        OSError: If ``input_file`` cannot be opened.
    """
    filter_lines = []
    bad_lines = []
    all_wen = set()  # first entries of accepted lines, for O(1) duplicate lookup
    loop_lines = []  # lines whose first entry repeats an accepted one
    short_lines = []  # lines shorter than 100 characters
    odd_number = []
    reg_lines = []  # lines rejected by the marker / length rules
    mingan = []  # lines containing sensitive words

    # Loop invariants, built once instead of once per line.
    keywords = (":--:", "\\frac", "\\mathrm", "\\text", ":---:", "----")
    sensitive = re.compile(r"(金山)|(银川)|(福州)|(沈阳)|(台北)|(邯郸)|(德州)|(苏州)|(南京)")

    with open(input_file, 'r', encoding='utf-8') as f_in:
        for item in f_in:
            try:
                json_obj = json.loads(item)
            except json.JSONDecodeError:
                # Unparseable text is a bad line.
                bad_lines.append(item)
                continue

            # Only non-empty arrays with a non-null first entry are usable.
            # Checking the type first keeps objects and scalars (which cannot
            # be indexed with 0) from raising and aborting the whole run.
            if not isinstance(json_obj, list) or not json_obj or json_obj[0] is None:
                bad_lines.append(item)
                continue
            first_element = json_obj[0]

            # Entries are expected in pairs; an odd count means a broken pair.
            if is_odd(len(json_obj)):
                odd_number.append(item)
                continue

            # Markdown table rules or LaTeX commands in the raw text.
            if any(keyword in item for keyword in keywords):
                reg_lines.append(item)
                continue
            if len(json_obj) > 2:
                reg_lines.append(item)
                continue

            if sensitive.search(item):
                mingan.append(item)
                continue

            # References to a figure that is not part of the text.
            if ("如图所示" in item and len(item) < 100) or "详见图" in item:
                bad_lines.append(item)
                continue

            # Lists and dicts are unhashable, so they get a canonical JSON
            # string as set key; the tuple wrapper keeps that key from ever
            # colliding with a plain string entry.
            if isinstance(first_element, (list, dict)):
                dedup_key = (
                    "__json__",
                    json.dumps(first_element, ensure_ascii=False, sort_keys=True),
                )
            else:
                dedup_key = first_element

            if dedup_key in all_wen and not noloop:
                loop_lines.append(item)
                continue

            if len(item) < 100:
                short_lines.append(item)
                continue

            # Accepted: remember the first entry for later duplicate checks.
            all_wen.add(dedup_key)
            filter_lines.append(item)

    result = {
        "bad": bad_lines,
        "good": filter_lines,
        "loop": loop_lines,
        "short": short_lines,
        'odd': odd_number,
        'reg_lines': reg_lines,
        "mingan": mingan
    }
    print(f'''bad_lines数量{len(bad_lines)},
          filter_lines{len(filter_lines)},
          loop_lines数量{len(loop_lines)},
          short_lines数量{len(short_lines)},
          odd_number数量{len(odd_number)},
          reg_lines数量{len(reg_lines)},
          mingan数量{len(mingan)}
          ''')
    return result


def write_files(out_folder, result_data, merge=None):
    """Write every category of ``result_data`` to ``<out_folder>/<key>.json``.

    Args:
        out_folder: Output directory; it is created, including missing
            parents, when it does not exist yet.
        result_data: Mapping of category name to a list of text lines.
        merge: When ``None`` (the default) the lines are written back
            verbatim, one after another. Any other value, including
            ``False``, dumps the whole list as a single indented JSON array
            of strings instead.

    Raises:
        OSError: If the directory cannot be created or a file cannot be
            written.
    """
    # exist_ok avoids the check-then-create race of testing os.path.exists
    # first and creating the directory afterwards.
    os.makedirs(out_folder, exist_ok=True)

    for key, lines in result_data.items():
        output_file_path = os.path.join(out_folder, f"{key}.json")

        with open(output_file_path, 'w', encoding='utf-8') as f_out:
            if merge is not None:
                f_out.write(json.dumps(lines, ensure_ascii=False, indent=2))
            else:
                # The lines keep their own terminators, so no separator is added.
                f_out.writelines(lines)


def beginCheckBadFun(input_file, out_file, noloop=False):
    """Classify the lines of ``input_file`` and write the categories out.

    The result of ``convert_json_files(input_file, noloop)`` is handed
    straight to ``write_files`` with ``out_file`` as the output folder.
    """
    write_files(out_file, convert_json_files(input_file, noloop))


if __name__ == "__main__":
    # JSON-lines file to be classified.
    source_path = '/Users/yushanghui/hongshantianping/ai训练/data/shuiwuBook/json/diedaiedit6.20.json'
    # Folder that receives one output file per category.
    target_folder = '/Users/yushanghui/hongshantianping/ai训练/data/shuiwuBook/jsonOut/diedaiedit6.20'

    beginCheckBadFun(source_path, target_folder)