# docx_2_text.py

import json
import os
import re
import shutil
import sys

import textract
  7. docx_file_list = []
  8. def list_dir_files(dir_path):
  9. dir_files = os.listdir(dir_path) # 得到该文件夹下所有的文件
  10. for file in dir_files:
  11. file_path = os.path.join(dir_path, file) # 路径拼接成绝对路径
  12. if os.path.isfile(file_path) and (file_path.endswith(".doc") or file_path.endswith(".docx")): # 如果是文件,并且是.doc或.docx文件
  13. docx_file_list.append(file_path)
  14. if os.path.isdir(file_path): # 如果是目录,就递归子目录
  15. list_dir_files(file_path)
  16. def save_new_txt(dst_folder, txt_file, chat_list):
  17. dst_file = os.path.join(dst_folder, txt_file)
  18. with open(dst_file, 'w', encoding='utf-8') as fw:
  19. for line in chat_list:
  20. fw.write(line + '\n')
  21. def read_doc_file(file_path):
  22. try:
  23. content = textract.process(file_path).decode('utf-8', errors='replace')
  24. return content
  25. except Exception as e:
  26. print(f"Error reading {file_path}: {e}")
  27. return ""
  28. def main(argv):
  29. input_dst = '/Users/yushanghui/hongshantianping/ai训练/data/公文分类与标注/' # 修改为你的输入文件夹路径
  30. dst_folder = '/Users/yushanghui/hongshantianping/ai训练/data/公文分类与标注_res/' # 修改为你的输出文件夹路径
  31. output_file = '/Users/yushanghui/hongshantianping/ai训练/data/gongwen.json' # 修改为你的json文件路径
  32. # 清理 dst_folder 目录
  33. if os.path.exists(dst_folder):
  34. shutil.rmtree(dst_folder)
  35. os.makedirs(dst_folder)
  36. jsonData = []
  37. list_dir_files(input_dst)
  38. print(f"Found {len(docx_file_list)} .doc/.docx files")
  39. for docx_file in docx_file_list:
  40. docx_file_name = os.path.basename(docx_file)
  41. txt_file_name = re.sub(r'\.docx?$', '.txt', docx_file_name) # 统一处理.doc和.docx后缀
  42. txt_file_name = re.sub(' +', '', txt_file_name) # 去掉文件名中的空格
  43. txt_file_path = os.path.join(dst_folder, txt_file_name)
  44. txt_lines = []
  45. if os.path.exists(txt_file_path):
  46. txt_lines = open(txt_file_path, 'r', encoding='utf-8').readlines()
  47. if os.path.exists(txt_file_path) and len(txt_lines) > 1:
  48. print(f"docx2txt: exists: txt_file_name: {txt_file_name}, lines={len(txt_lines)}")
  49. continue
  50. content = read_doc_file(docx_file)
  51. page_contents = content.splitlines()
  52. txt_data = {
  53. 'file_name': txt_file_name.replace('.txt', ''),
  54. 'des':'',
  55. 'keywords':'',
  56. 'type':'',
  57. 'content':content
  58. }
  59. jsonData.append(txt_data)
  60. save_new_txt(dst_folder, txt_file_name, page_contents)
  61. with open(output_file, 'w', encoding='utf-8') as f_out:
  62. json.dump(jsonData, f_out, ensure_ascii=False, indent=4)
  63. if __name__ == '__main__':
  64. main(sys.argv)