1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556 |
- import os
- import json
- import docx
def extract_text_from_docx(docx_file):
    """Return the text of a Word document as one newline-joined string.

    The text of every paragraph in the document is joined with newline
    characters and surrounding whitespace is stripped from the result.

    Args:
        docx_file: Path (or whatever ``docx.Document`` accepts) of the
            document to read.

    Returns:
        The stripped document text, or an empty string if anything goes
        wrong while opening or reading the document.
    """
    try:
        document = docx.Document(docx_file)
        paragraph_texts = (paragraph.text for paragraph in document.paragraphs)
        return "\n".join(paragraph_texts).strip()
    except Exception as e:
        # Best-effort: one unreadable file is reported and treated as empty
        # so that the caller can carry on with the remaining files.
        print(f"Error extracting text from {docx_file}: {e}")
        return ""
def extract_title_and_content(text):
    """Split document text into a title and the remaining content.

    The text is split on ``'\\n'``; lines that are empty after stripping are
    ignored. The first remaining line becomes the title and all later ones
    are joined with newlines to form the content.

    Args:
        text: Full document text.

    Returns:
        A ``(title, content)`` tuple of strings. Both are empty when the
        text holds no non-blank line; content is empty when only a title
        line exists.
    """
    stripped_lines = [piece.strip() for piece in text.split('\n')]
    meaningful = [piece for piece in stripped_lines if piece]
    if not meaningful:
        return "", ""
    heading, *body = meaningful
    return heading, "\n".join(body)
def parse_documents(folder_path):
    """Parse the Word documents in a folder into question/answer records.

    Every regular file in ``folder_path`` whose name ends in ``.docx`` or
    ``.doc`` (case-insensitive) is read with ``extract_text_from_docx``.
    Files that yield no text are skipped. Sub-folders are not searched.

    Args:
        folder_path: Directory containing the documents.

    Returns:
        A list of dicts, one per parsed document, with the title stored
        under the key ``"问"`` and the remaining text under ``"答"``. The
        list is ordered by file name.

    Raises:
        OSError: If ``folder_path`` cannot be listed.
    """
    parsed_data = []
    # os.listdir returns names in arbitrary order; sort for reproducible output.
    for file_name in sorted(os.listdir(folder_path)):
        # Microsoft Word keeps "~$..." owner files next to open documents;
        # they are not real documents and would only produce error noise.
        if file_name.startswith('~$'):
            continue
        # NOTE(review): ".doc" is still attempted for backward compatibility,
        # but python-docx targets the .docx format, so legacy binary .doc
        # files are expected to be reported as errors and skipped.
        if not file_name.lower().endswith(('.docx', '.doc')):
            continue
        file_path = os.path.join(folder_path, file_name)
        # A directory whose name happens to end in .docx is not a document.
        if not os.path.isfile(file_path):
            continue
        text = extract_text_from_docx(file_path)
        if not text:
            continue
        title, content = extract_title_and_content(text)
        if not title:
            # Fall back to the file name (without extension) as the title.
            title = os.path.splitext(file_name)[0]
        parsed_data.append({"问": title, "答": content})
    return parsed_data
def save_to_json(data, json_file):
    """Write ``data`` to ``json_file`` as UTF-8 encoded, indented JSON.

    The file is opened in write mode, so existing content is replaced.
    Non-ASCII characters are written as-is instead of being escaped, and
    nested structures are indented by four spaces.

    Args:
        data: JSON-serialisable object to store.
        json_file: Path of the file to write.
    """
    with open(json_file, mode='w', encoding='utf-8') as handle:
        json.dump(
            data,
            handle,
            indent=4,
            ensure_ascii=False,
        )
# Folder that holds the Word documents to convert.
folder_path = '/Users/yushanghui/Downloads/slibra-img/环保督察类/进水水质超标/'
# Destination of the generated JSON file.
json_file = '/Users/yushanghui/Downloads/slibra-img/环保督察类/进水水质超标/parsed_documents.json'

# Parse every document in the folder, then write the records out as JSON.
parsed_data = parse_documents(folder_path)
save_to_json(parsed_data, json_file)
print("解析完成,并已保存到", json_file)
|