# wordToTxt.py — extract text from Word documents in a folder and save it as JSON.
import json
import os

import docx
  4. def extract_text_from_docx(docx_file):
  5. try:
  6. doc = docx.Document(docx_file)
  7. text = "\n".join([para.text for para in doc.paragraphs])
  8. return text.strip()
  9. except Exception as e:
  10. print(f"Error extracting text from {docx_file}: {e}")
  11. return ""
  12. def extract_title_and_content(text):
  13. lines = text.split('\n')
  14. title = ""
  15. content = ""
  16. for line in lines:
  17. if line.strip():
  18. if not title:
  19. title = line.strip()
  20. else:
  21. content += line.strip() + "\n"
  22. return title, content.strip()
  23. def parse_documents(folder_path):
  24. parsed_data = []
  25. for file_name in os.listdir(folder_path):
  26. file_path = os.path.join(folder_path, file_name)
  27. if file_name.endswith('.docx') or file_name.endswith('.doc'):
  28. text = extract_text_from_docx(file_path)
  29. if text:
  30. title, content = extract_title_and_content(text)
  31. if not title:
  32. title = os.path.splitext(file_name)[0]
  33. parsed_data.append({"问": title, "答": content})
  34. else:
  35. continue
  36. return parsed_data
  37. def save_to_json(data, json_file):
  38. with open(json_file, 'w', encoding='utf-8') as f:
  39. json.dump(data, f, ensure_ascii=False, indent=4)
  40. # 指定文件夹路径
  41. folder_path = '/Users/yushanghui/Downloads/slibra-img/环保督察类/进水水质超标/'
  42. # 解析文件夹中的文档
  43. parsed_data = parse_documents(folder_path)
  44. # 保存到JSON文件
  45. json_file = '/Users/yushanghui/Downloads/slibra-img/环保督察类/进水水质超标/parsed_documents.json'
  46. save_to_json(parsed_data, json_file)
  47. print("解析完成,并已保存到", json_file)