"""Convert a folder of Word documents into a JSON list of Q&A records.

For every document, the first non-blank line becomes the question ("问")
and the remaining non-blank lines become the answer ("答").
"""

import json
import os
from typing import Dict, List, Tuple

try:
    import docx
except ImportError:
    # Keep the module importable without python-docx so the pure text helpers
    # remain usable; extract_text_from_docx raises as soon as it is needed.
    docx = None

# Folder containing the documents to parse.
DEFAULT_FOLDER_PATH = '/Users/yushanghui/Downloads/slibra-img/环保督察类/进水水质超标/'

# JSON file the parsed records are written to.
DEFAULT_JSON_FILE = '/Users/yushanghui/Downloads/slibra-img/环保督察类/进水水质超标/parsed_documents.json'

# File extensions handed to the extractor (compared case-insensitively).
# NOTE(review): python-docx documents support for .docx only; a legacy binary
# .doc is expected to fail in extract_text_from_docx and be skipped - confirm.
_WORD_EXTENSIONS = ('.docx', '.doc')

# Prefix of the temporary owner/lock files Word creates next to an open
# document; they are not real documents.
_LOCK_FILE_PREFIX = '~$'


def extract_text_from_docx(docx_file: str) -> str:
    """Return the text of a Word document, or "" if it cannot be read.

    The text of every entry in ``doc.paragraphs`` is joined with newlines and
    the result is stripped of surrounding whitespace.
    NOTE(review): text inside tables is presumably not part of
    ``doc.paragraphs`` - verify against the python-docx documentation.

    Args:
        docx_file: Path of the document to read.

    Returns:
        The stripped document text; "" when the file cannot be parsed.

    Raises:
        ImportError: If python-docx is not installed.
    """
    if docx is None:
        # Raised outside the try block so a missing dependency is never
        # mistaken for an unreadable document.
        raise ImportError("python-docx is required to read Word documents")

    try:
        doc = docx.Document(docx_file)
        text = "\n".join(para.text for para in doc.paragraphs)
        return text.strip()
    except Exception as e:
        # Deliberately broad: one unreadable file is reported and skipped
        # instead of aborting the whole batch.
        print(f"Error extracting text from {docx_file}: {e}")
        return ""


def extract_title_and_content(text: str) -> Tuple[str, str]:
    """Split text into a title and the remaining content.

    Blank lines are dropped and every kept line is stripped.  The first kept
    line is the title; the others are joined with newlines as the content.

    Args:
        text: Raw text with lines separated by ``"\\n"``.

    Returns:
        A ``(title, content)`` tuple; both are "" when the text has no
        non-blank line.
    """
    stripped_lines = (line.strip() for line in text.split('\n'))
    kept = [line for line in stripped_lines if line]
    if not kept:
        return "", ""
    return kept[0], "\n".join(kept[1:])


def parse_documents(folder_path: str) -> List[Dict[str, str]]:
    """Parse every Word document directly inside a folder.

    Files are visited in sorted name order so the output is reproducible.
    Word lock files and files without a Word extension are ignored, as are
    documents that yield no text.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        One ``{"问": title, "答": content}`` dict per readable document.
    """
    parsed_data = []
    # os.listdir returns names in arbitrary order; sort for stable output.
    for file_name in sorted(os.listdir(folder_path)):
        if file_name.startswith(_LOCK_FILE_PREFIX):
            continue
        if not file_name.lower().endswith(_WORD_EXTENSIONS):
            continue

        text = extract_text_from_docx(os.path.join(folder_path, file_name))
        if not text:
            continue

        title, content = extract_title_and_content(text)
        if not title:
            # Defensive fallback: use the file name without its extension.
            title = os.path.splitext(file_name)[0]
        parsed_data.append({"问": title, "答": content})
    return parsed_data


def save_to_json(data, json_file: str) -> None:
    """Write data to a UTF-8 JSON file.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) and the
    output is indented by four spaces.

    Args:
        data: JSON-serializable object to write.
        json_file: Destination path; an existing file is overwritten.
    """
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)


def main() -> None:
    """Parse the default folder and save the records to the default file."""
    # Parse the documents in the folder.
    parsed_data = parse_documents(DEFAULT_FOLDER_PATH)

    # Save them to the JSON file.
    save_to_json(parsed_data, DEFAULT_JSON_FILE)
    print("解析完成,并已保存到", DEFAULT_JSON_FILE)


if __name__ == "__main__":
    main()