# ai-model / dataTools
import json
import os
import re
import shutil
import sys

import textract

# Module-level accumulator: list_dir_files() appends every matching path here.
docx_file_list = []


def list_dir_files(dir_path):
    """Recursively collect .doc/.docx file paths found under ``dir_path``.

    Matching paths are appended to the module-level ``docx_file_list``;
    nothing is returned. The list is not cleared first, so repeated calls
    accumulate results.

    Args:
        dir_path: Directory to scan. Collected paths are ``dir_path`` joined
            with the entry names, so they are absolute only if ``dir_path``
            itself is absolute.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``dir_path`` (or a
            sub-directory) cannot be listed.
    """
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # collected list (and anything derived from it) reproducible.
    for entry_name in sorted(os.listdir(dir_path)):
        entry_path = os.path.join(dir_path, entry_name)
        if os.path.isdir(entry_path):
            # Descend into sub-directories.
            list_dir_files(entry_path)
        elif os.path.isfile(entry_path) and entry_path.endswith((".doc", ".docx")):
            docx_file_list.append(entry_path)

def save_new_txt(dst_folder, txt_file, chat_list):
    """Write each string in ``chat_list`` as one line of a UTF-8 text file.

    Args:
        dst_folder: Directory in which the file is created.
        txt_file: Name of the file inside ``dst_folder``.
        chat_list: Iterable of strings; a newline is appended to each one.

    Any existing file at the target path is overwritten.
    """
    target_path = os.path.join(dst_folder, txt_file)
    with open(target_path, 'w', encoding='utf-8') as out:
        out.writelines(entry + '\n' for entry in chat_list)

def read_doc_file(file_path):
    """Extract the text of a document via ``textract``.

    Args:
        file_path: Path of the document to read.

    Returns:
        The extracted text decoded as UTF-8 (undecodable bytes are
        replaced), or an empty string if extraction or decoding raises.
        Failures are reported on stdout rather than propagated.
    """
    try:
        raw_bytes = textract.process(file_path)
        text = raw_bytes.decode('utf-8', errors='replace')
    except Exception as err:
        # Best effort: report the failure and fall back to empty text.
        print(f"Error reading {file_path}: {err}")
        text = ""
    return text

def main(argv):
    """Convert every .doc/.docx under the input folder to .txt plus a JSON index.

    For each document collected by ``list_dir_files`` the text is extracted
    with ``read_doc_file``, written line by line to a .txt file in the output
    folder, and recorded as a dict (``file_name``, ``des``, ``keywords``,
    ``type``, ``content``) in a list that is finally dumped as JSON.
    Documents whose text cannot be read still produce a record with empty
    ``content``, because ``read_doc_file`` returns an empty string on failure.

    Args:
        argv: ``sys.argv``-style argument list. Optional positional
            overrides: ``argv[1]`` input folder, ``argv[2]`` output folder
            for the .txt files, ``argv[3]`` output JSON path. Any argument
            that is missing falls back to the built-in default path.

    Raises:
        FileNotFoundError: If the input folder is not an existing directory.
            Raised before the output folder is touched.

    Side effects:
        Deletes and recreates the output folder, writes the .txt files and
        the JSON file, and prints progress messages.
    """
    default_paths = [
        '/Users/yushanghui/hongshantianping/ai训练/data/公文分类与标注/',  # input folder
        '/Users/yushanghui/hongshantianping/ai训练/data/公文分类与标注_res/',  # output folder
        '/Users/yushanghui/hongshantianping/ai训练/data/gongwen.json',  # output JSON file
    ]
    overrides = list(argv[1:4]) if argv else []
    input_dst, dst_folder, output_file = overrides + default_paths[len(overrides):]

    # Validate the input before the destructive cleanup below, so a wrong
    # path does not wipe previously generated results.
    if not os.path.isdir(input_dst):
        raise FileNotFoundError(f"Input folder not found: {input_dst}")

    # Start from an empty output folder.
    if os.path.exists(dst_folder):
        shutil.rmtree(dst_folder)
    os.makedirs(dst_folder)

    jsonData = []

    list_dir_files(input_dst)
    print(f"Found {len(docx_file_list)} .doc/.docx files")

    for docx_file in docx_file_list:
        docx_file_name = os.path.basename(docx_file)
        txt_file_name = re.sub(r'\.docx?$', '.txt', docx_file_name)  # handles both .doc and .docx
        txt_file_name = re.sub(' +', '', txt_file_name)  # drop spaces from the file name
        txt_file_path = os.path.join(dst_folder, txt_file_name)

        # The output folder was just emptied, so an existing file means an
        # earlier document in this run mapped to the same .txt name.
        if os.path.exists(txt_file_path):
            with open(txt_file_path, 'r', encoding='utf-8') as f_in:
                txt_lines = f_in.readlines()
            if len(txt_lines) > 1:
                print(f"docx2txt: exists: txt_file_name: {txt_file_name}, lines={len(txt_lines)}")
                continue

        content = read_doc_file(docx_file)

        # Strip only the trailing ".txt"; str.replace would also remove any
        # ".txt" occurring earlier in the name.
        if txt_file_name.endswith('.txt'):
            record_name = txt_file_name[:-len('.txt')]
        else:
            record_name = txt_file_name

        txt_data = {
            'file_name': record_name,
            'des': '',
            'keywords': '',
            'type': '',
            'content': content,
        }
        jsonData.append(txt_data)
        save_new_txt(dst_folder, txt_file_name, content.splitlines())

    with open(output_file, 'w', encoding='utf-8') as f_out:
        json.dump(jsonData, f_out, ensure_ascii=False, indent=4)
    
if __name__ == '__main__':
    main(sys.argv)