"""Extract the text of every .doc/.docx file under a folder.

Each document is written out as a ``.txt`` file, and all documents are also
collected into a single JSON file with one record per document.
"""

import json
import os
import re
import shutil
import sys

import textract

# Paths of every .doc/.docx file found so far; filled in by list_dir_files().
docx_file_list = []


def list_dir_files(dir_path):
    """Recursively collect the .doc/.docx files under *dir_path*.

    Matching paths are appended to the module-level ``docx_file_list``;
    nothing is returned.

    Args:
        dir_path: Directory to scan. Sub-directories are scanned too.
    """
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # processing order (and therefore the JSON output) reproducible.
    for entry in sorted(os.listdir(dir_path)):
        entry_path = os.path.join(dir_path, entry)
        if os.path.isdir(entry_path):
            # Directory: recurse into it.
            list_dir_files(entry_path)
        elif os.path.isfile(entry_path) and entry_path.endswith((".doc", ".docx")):
            docx_file_list.append(entry_path)


def save_new_txt(dst_folder, txt_file, chat_list):
    """Write *chat_list* to ``dst_folder/txt_file`` as UTF-8, one item per line.

    Args:
        dst_folder: Existing directory that receives the file.
        txt_file: Name of the file to create (overwritten if present).
        chat_list: Iterable of strings; a newline is appended to each.
    """
    dst_file = os.path.join(dst_folder, txt_file)
    with open(dst_file, 'w', encoding='utf-8') as fw:
        fw.writelines(line + '\n' for line in chat_list)


def read_doc_file(file_path):
    """Return the text of the document at *file_path*, or ``""`` on failure.

    The bytes returned by ``textract.process`` are decoded as UTF-8 with
    undecodable bytes replaced. Any exception is reported on stdout and
    swallowed, so one unreadable document does not abort the whole run.
    """
    try:
        content = textract.process(file_path).decode('utf-8', errors='replace')
        return content
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""


def _txt_name_for(docx_path):
    """Return the output file name for *docx_path*: .doc/.docx -> .txt, spaces removed."""
    name = os.path.basename(docx_path)
    name = re.sub(r'\.docx?$', '.txt', name)  # handle .doc and .docx alike
    return re.sub(' +', '', name)  # drop spaces from the file name


def _strip_txt_suffix(txt_file_name):
    """Return *txt_file_name* without its trailing ``.txt`` (inner ``.txt`` is kept)."""
    suffix = '.txt'
    if txt_file_name.endswith(suffix):
        return txt_file_name[:-len(suffix)]
    return txt_file_name


def main(argv):
    """Convert every document under the input folder to .txt and build the JSON file.

    The output folder is deleted and recreated first. For each document a
    record with the keys ``file_name``, ``des``, ``keywords``, ``type`` and
    ``content`` is appended to the JSON list, and the text is saved line by
    line as a ``.txt`` file.

    Args:
        argv: Command-line arguments; currently unused, the paths are
            hard-coded below.
    """
    input_dst = '/Users/yushanghui/hongshantianping/ai训练/data/公文分类与标注/'  # change to your input folder path
    dst_folder = '/Users/yushanghui/hongshantianping/ai训练/data/公文分类与标注_res/'  # change to your output folder path
    output_file = '/Users/yushanghui/hongshantianping/ai训练/data/gongwen.json'  # change to your JSON file path

    # Start from an empty output folder.
    if os.path.exists(dst_folder):
        shutil.rmtree(dst_folder)
    os.makedirs(dst_folder)

    records = []
    list_dir_files(input_dst)
    print(f"Found {len(docx_file_list)} .doc/.docx files")

    for docx_file in docx_file_list:
        txt_file_name = _txt_name_for(docx_file)
        txt_file_path = os.path.join(dst_folder, txt_file_name)

        # The folder was emptied above, so an existing file was written by an
        # earlier document of this run that maps to the same name. Keep the
        # earlier result when it has more than one line; otherwise overwrite.
        if os.path.exists(txt_file_path):
            with open(txt_file_path, 'r', encoding='utf-8') as fr:
                txt_lines = fr.readlines()
            if len(txt_lines) > 1:
                print(f"docx2txt: exists: txt_file_name: {txt_file_name}, lines={len(txt_lines)}")
                continue

        content = read_doc_file(docx_file)
        page_contents = content.splitlines()

        txt_data = {
            # Strip only the trailing ".txt"; a ".txt" inside the name stays.
            'file_name': _strip_txt_suffix(txt_file_name),
            'des': '',
            'keywords': '',
            'type': '',
            'content': content,
        }
        records.append(txt_data)
        save_new_txt(dst_folder, txt_file_name, page_contents)

    with open(output_file, 'w', encoding='utf-8') as f_out:
        json.dump(records, f_out, ensure_ascii=False, indent=4)


if __name__ == '__main__':
    main(sys.argv)