1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677 |
- import sys
- import os
- import re
- import textract
- import json
- import shutil
# Accumulator filled by list_dir_files(); kept at module level so callers can
# read the collected paths after the scan.
docx_file_list = []


def list_dir_files(dir_path):
    """Recursively collect Word documents found under *dir_path*.

    Every regular file whose name ends with ``.doc`` or ``.docx`` is appended
    to the module-level ``docx_file_list`` as ``os.path.join(parent, name)``.
    Sub-directories are scanned recursively.

    Entries are visited in sorted name order so that the collected list (and
    anything derived from it) is reproducible; ``os.listdir`` itself returns
    names in arbitrary order. Names starting with ``~$`` are skipped because
    Microsoft Word uses that prefix for the owner/lock files it creates next
    to an open document; they carry the document's extension but are not
    documents.

    Args:
        dir_path: Folder to scan.

    Returns:
        None. Results accumulate in ``docx_file_list``; the list is not
        cleared between calls.
    """
    for entry in sorted(os.listdir(dir_path)):
        entry_path = os.path.join(dir_path, entry)
        if os.path.isdir(entry_path):
            # Descend into sub-directories.
            list_dir_files(entry_path)
        elif (
            os.path.isfile(entry_path)
            and entry.endswith((".doc", ".docx"))
            and not entry.startswith("~$")
        ):
            docx_file_list.append(entry_path)
def save_new_txt(dst_folder, txt_file, chat_list):
    """Write *chat_list* to ``dst_folder/txt_file`` as UTF-8, one item per line.

    Args:
        dst_folder: Destination directory. It is created (including parents)
            when it does not exist yet; an empty string means the current
            working directory.
        txt_file: Name of the text file inside *dst_folder*.
        chat_list: Iterable of strings; a newline is appended to each item.

    Returns:
        None. An existing file at the destination is overwritten.
    """
    if dst_folder:
        # Avoid FileNotFoundError when the caller has not created the folder.
        os.makedirs(dst_folder, exist_ok=True)
    dst_file = os.path.join(dst_folder, txt_file)
    with open(dst_file, 'w', encoding='utf-8') as fw:
        fw.writelines(line + '\n' for line in chat_list)
def read_doc_file(file_path):
    """Extract the text of a document via ``textract``.

    Args:
        file_path: Path of the document handed to ``textract.process``.

    Returns:
        The extracted bytes decoded as UTF-8 (undecodable bytes are replaced),
        or an empty string when extraction or decoding raised any exception.
        Failures are reported with ``print`` rather than propagated, so one
        unreadable document does not abort a batch.
    """
    try:
        raw_bytes = textract.process(file_path)
        text = raw_bytes.decode('utf-8', errors='replace')
    except Exception as e:
        # Deliberate best-effort: report and fall back to empty content.
        print(f"Error reading {file_path}: {e}")
        return ""
    return text
def main(argv):
    """Convert every .doc/.docx file under the input folder to text and JSON.

    The output folder is wiped and recreated, the input folder is scanned with
    ``list_dir_files``, and for each document the extracted text is written to
    ``<output folder>/<name>.txt``. A record with the keys ``file_name``,
    ``des``, ``keywords``, ``type`` and ``content`` is collected per document
    and all records are dumped to the JSON output file.

    Two source documents can map to the same ``.txt`` name (same base name in
    different folders, or names differing only by spaces). In that case the
    later document is skipped when the text file already written has more than
    one line.

    Args:
        argv: Command-line argument list (``sys.argv`` style). Optional
            positional arguments override the built-in paths: ``argv[1]`` is
            the input folder, ``argv[2]`` the output folder and ``argv[3]``
            the JSON output file. Missing arguments fall back to the defaults.

    Raises:
        ValueError: If the input folder is the output folder or lies inside
            it, because wiping the output folder would delete the input.
    """
    # Built-in defaults; override them on the command line or edit them here.
    default_paths = [
        '/Users/yushanghui/hongshantianping/ai训练/data/公文分类与标注/',  # input folder
        '/Users/yushanghui/hongshantianping/ai训练/data/公文分类与标注_res/',  # output folder
        '/Users/yushanghui/hongshantianping/ai训练/data/gongwen.json',  # JSON output file
    ]
    overrides = list(argv[1:4]) if argv else []
    input_dst, dst_folder, output_file = overrides + default_paths[len(overrides):]

    # The output folder is deleted below; never let that take the input with it.
    src_abs = os.path.abspath(input_dst)
    dst_abs = os.path.abspath(dst_folder)
    if src_abs == dst_abs or src_abs.startswith(os.path.join(dst_abs, '')):
        raise ValueError(
            f"Refusing to wipe output folder {dst_folder!r}: "
            f"it contains the input folder {input_dst!r}"
        )

    # Start from an empty output folder.
    if os.path.exists(dst_folder):
        shutil.rmtree(dst_folder)
    os.makedirs(dst_folder)
    jsonData = []

    list_dir_files(input_dst)
    print(f"Found {len(docx_file_list)} .doc/.docx files")
    for docx_file in docx_file_list:
        docx_file_name = os.path.basename(docx_file)
        # Map both .doc and .docx to .txt, then drop spaces from the name.
        txt_file_name = re.sub(r'\.docx?$', '.txt', docx_file_name)
        txt_file_name = re.sub(' +', '', txt_file_name)
        txt_file_path = os.path.join(dst_folder, txt_file_name)

        # Skip a document whose text file was already produced in this run.
        if os.path.exists(txt_file_path):
            with open(txt_file_path, 'r', encoding='utf-8') as f_in:
                txt_lines = f_in.readlines()
            if len(txt_lines) > 1:
                print(f"docx2txt: exists: txt_file_name: {txt_file_name}, lines={len(txt_lines)}")
                continue

        content = read_doc_file(docx_file)
        page_contents = content.splitlines()

        # Strip only the trailing ".txt"; a ".txt" inside the name must stay.
        if txt_file_name.endswith('.txt'):
            record_name = txt_file_name[:-len('.txt')]
        else:
            record_name = txt_file_name
        txt_data = {
            'file_name': record_name,
            'des': '',
            'keywords': '',
            'type': '',
            'content': content,
        }
        jsonData.append(txt_data)
        save_new_txt(dst_folder, txt_file_name, page_contents)

    with open(output_file, 'w', encoding='utf-8') as f_out:
        json.dump(jsonData, f_out, ensure_ascii=False, indent=4)
-
# Script entry point: run the conversion only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    main(sys.argv)
|